Assignment 10A

Assignment 10 A

We’ve reproduced the example code for sentiment analysis from chapter two of Text Mining With R. Using the sotu library (filled with state of the union speeches up until 2020), we will break up the text and perform a similar sentiment analysis.

library(tidyverse)
Warning: package 'tidyverse' was built under R version 4.5.2
Warning: package 'ggplot2' was built under R version 4.5.2
Warning: package 'tibble' was built under R version 4.5.2
Warning: package 'readr' was built under R version 4.5.2
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.6
✔ forcats   1.0.1     ✔ stringr   1.5.2
✔ ggplot2   4.0.1     ✔ tibble    3.3.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.1.0     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(janeaustenr)
Warning: package 'janeaustenr' was built under R version 4.5.3
library(tidytext)
Warning: package 'tidytext' was built under R version 4.5.3
library(textdata)
Warning: package 'textdata' was built under R version 4.5.3
library(wordcloud)
Warning: package 'wordcloud' was built under R version 4.5.3
Loading required package: RColorBrewer
library(tidyr)
library(reshape2)
Warning: package 'reshape2' was built under R version 4.5.2

Attaching package: 'reshape2'

The following object is masked from 'package:tidyr':

    smiths
tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(
    linenumber = row_number(),
    chapter = cumsum(str_detect(text, 
                                regex("^chapter [\\divxlc]", 
                                      ignore_case = TRUE)))) %>%
  ungroup() %>%
  unnest_tokens(word, text)



#get_sentiments("afinn")
#get_sentiments("bing")
#get_sentiments("nrc")
nrc_joy <- get_sentiments("nrc") %>% 
  filter(sentiment == "joy")

tidy_books %>%
  filter(book == "Emma") %>%
  inner_join(nrc_joy) %>%
  count(word, sort = TRUE)
Joining with `by = join_by(word)`
# A tibble: 301 × 2
   word          n
   <chr>     <int>
 1 good        359
 2 friend      166
 3 hope        143
 4 happy       125
 5 love        117
 6 deal         92
 7 found        92
 8 present      89
 9 kind         82
10 happiness    76
# ℹ 291 more rows
http://saifmohammad.com/WebDocs/Lexicons/NRC-Emotion-Lexicon.zip' Content type 'application/zip' length 25878449 bytes (24.7 MB)
jane_austen_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>% 
  mutate(sentiment = positive - negative)
Joining with `by = join_by(word)`
Warning in inner_join(., get_sentiments("bing")): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 435434 of `x` matches multiple rows in `y`.
ℹ Row 5051 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship =
  "many-to-many"` to silence this warning.
ggplot(jane_austen_sentiment, aes(index, sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x")

pride_prejudice <- tidy_books %>% 
  filter(book == "Pride & Prejudice")

pride_prejudice
# A tibble: 122,204 × 4
   book              linenumber chapter word     
   <fct>                  <int>   <int> <chr>    
 1 Pride & Prejudice          1       0 pride    
 2 Pride & Prejudice          1       0 and      
 3 Pride & Prejudice          1       0 prejudice
 4 Pride & Prejudice          3       0 by       
 5 Pride & Prejudice          3       0 jane     
 6 Pride & Prejudice          3       0 austen   
 7 Pride & Prejudice          7       1 chapter  
 8 Pride & Prejudice          7       1 1        
 9 Pride & Prejudice         10       1 it       
10 Pride & Prejudice         10       1 is       
# ℹ 122,194 more rows
afinn <- pride_prejudice %>% 
  inner_join(get_sentiments("afinn")) %>% 
  group_by(index = linenumber %/% 80) %>% 
  summarise(sentiment = sum(value)) %>% 
  mutate(method = "AFINN")
Joining with `by = join_by(word)`
bing_and_nrc <- bind_rows(
  pride_prejudice %>% 
    inner_join(get_sentiments("bing")) %>%
    mutate(method = "Bing et al."),
  pride_prejudice %>% 
    inner_join(get_sentiments("nrc") %>% 
                 filter(sentiment %in% c("positive", 
                                         "negative"))
    ) %>%
    mutate(method = "NRC")) %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment,
              values_from = n,
              values_fill = 0) %>% 
  mutate(sentiment = positive - negative)
Joining with `by = join_by(word)`
Joining with `by = join_by(word)`
Warning in inner_join(., get_sentiments("nrc") %>% filter(sentiment %in% : Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 215 of `x` matches multiple rows in `y`.
ℹ Row 5178 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship =
  "many-to-many"` to silence this warning.
bind_rows(afinn, 
          bing_and_nrc) %>%
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")

get_sentiments("nrc") %>% 
  filter(sentiment %in% c("positive", "negative")) %>% 
  count(sentiment)
# A tibble: 2 × 2
  sentiment     n
  <chr>     <int>
1 negative   3316
2 positive   2308
get_sentiments("bing") %>% 
  count(sentiment)
# A tibble: 2 × 2
  sentiment     n
  <chr>     <int>
1 negative   4781
2 positive   2005
bing_word_counts <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
Joining with `by = join_by(word)`
Warning in inner_join(., get_sentiments("bing")): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 435434 of `x` matches multiple rows in `y`.
ℹ Row 5051 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship =
  "many-to-many"` to silence this warning.
bing_word_counts
# A tibble: 2,585 × 3
   word     sentiment     n
   <chr>    <chr>     <int>
 1 miss     negative   1855
 2 well     positive   1523
 3 good     positive   1380
 4 great    positive    981
 5 like     positive    725
 6 better   positive    639
 7 enough   positive    613
 8 happy    positive    534
 9 love     positive    495
10 pleasure positive    462
# ℹ 2,575 more rows
bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>% 
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = "Contribution to sentiment",
       y = NULL)

tidy_books %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))
Joining with `by = join_by(word)`

I skipped the part where he adds the word “miss” to stop words.

tidy_books %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))
Joining with `by = join_by(word)`

tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray20", "gray80"),
                   max.words = 100)
Joining with `by = join_by(word)`
Warning in inner_join(., get_sentiments("bing")): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 435434 of `x` matches multiple rows in `y`.
ℹ Row 5051 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship =
  "many-to-many"` to silence this warning.

p_and_p_sentences <- tibble(text = prideprejudice) %>% 
  unnest_tokens(sentence, text, token = "sentences")

p_and_p_sentences$sentence[2]
[1] "by jane austen"
austen_chapters <- austen_books() %>%
  group_by(book) %>%
  unnest_tokens(chapter, text, token = "regex", 
                pattern = "Chapter|CHAPTER [\\dIVXLC]") %>%
  ungroup()

austen_chapters %>% 
  group_by(book) %>% 
  summarise(chapters = n())
# A tibble: 6 × 2
  book                chapters
  <fct>                  <int>
1 Sense & Sensibility       51
2 Pride & Prejudice         62
3 Mansfield Park            49
4 Emma                      56
5 Northanger Abbey          32
6 Persuasion                25
bingnegative <- get_sentiments("bing") %>% 
  filter(sentiment == "negative")

wordcounts <- tidy_books %>%
  group_by(book, chapter) %>%
  summarize(words = n())
`summarise()` has grouped output by 'book'. You can override using the
`.groups` argument.
tidy_books %>%
  semi_join(bingnegative) %>%
  group_by(book, chapter) %>%
  summarize(negativewords = n()) %>%
  left_join(wordcounts, by = c("book", "chapter")) %>%
  mutate(ratio = negativewords/words) %>%
  filter(chapter != 0) %>%
  slice_max(ratio, n = 1) %>% 
  ungroup()
Joining with `by = join_by(word)`
`summarise()` has grouped output by 'book'. You can override using the
`.groups` argument.
# A tibble: 6 × 5
  book                chapter negativewords words  ratio
  <fct>                 <int>         <int> <int>  <dbl>
1 Sense & Sensibility      43           161  3405 0.0473
2 Pride & Prejudice        34           111  2104 0.0528
3 Mansfield Park           46           173  3685 0.0469
4 Emma                     15           151  3340 0.0452
5 Northanger Abbey         21           149  2982 0.0500
6 Persuasion                4            62  1807 0.0343

Silge, J., & Robinson, D. (2017). Text mining with R: A tidy approach. O’Reilly Media.

Sentiment Analysis With the sotu Package

The sotu package contains state of the union addresses.

library(sotu)
Warning: package 'sotu' was built under R version 4.5.2
data(sotu_text)

#make a tidy data frame
sotu_df <- tibble(
  president = sotu_meta$president,
  year = sotu_meta$year,
  text = sotu_text
)

#unnest
tidy_sotu <- sotu_df %>%
  unnest_tokens(word, text)

Choosing three sets of presidential speeches

There are hundreds of state of the union addresses. For this exercise, we’ll look at three presidents: Abraham Lincoln, Franklin D. Roosevelt, and Barack Obama.

obama <- sotu_df |>
  filter (president == "Barack Obama")

d_roosevelt <- sotu_df |>
  filter (president == "Franklin D. Roosevelt")

lincoln <- sotu_df |>
  filter (president == "Abraham Lincoln")

Unnesting the text

#Obama
tidy_obama <- obama %>%
  group_by(year) %>%
  mutate(
    linenumber = row_number(),
    speech_id = cur_group_id() 
  ) %>%
  ungroup() %>%
    unnest_tokens(word, text)
    

#Roosevelt
tidy_roosevelt <- d_roosevelt %>%
  group_by(year) %>%
  mutate(
    linenumber = row_number(),
    speech_id = cur_group_id() 
  ) %>%
  ungroup() %>%
    unnest_tokens(word, text)

#Lincoln

tidy_lincoln <- lincoln %>%
  group_by(year) %>%
  mutate(
    linenumber = row_number(),
    speech_id = cur_group_id() 
  ) %>%
  ungroup() %>%
    unnest_tokens(word, text)

Bing net sentiment over the years

Instead of grouping by chapters, we’ll group by year, and show overall sentiments together, so we can see how it changed over a presidency.

obama_sentiment <- tidy_obama %>%
  inner_join(get_sentiments("bing")) %>%
  count(year, index = linenumber, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>% 
  mutate(sentiment = positive - negative)
Joining with `by = join_by(word)`
ggplot(obama_sentiment, aes(x = year, y = sentiment, fill = year)) +
  geom_col(show.legend = FALSE) + 
  scale_x_continuous(breaks = seq(min(obama_sentiment$year), max(obama_sentiment$year), by = 1)) +
  labs (title = "Obama's net sentiment over time")

2009 and 2010 speeches were not very positive, possibly due to the global economic crisis. After the midterms (which the democrats lost big time), he became more positive, and then was more negative in the latter half of his second term.

for FDR:

FDR had more terms, and therefore more states of the union, than any other president.

fdr_sentiment <- tidy_roosevelt %>%
  inner_join(get_sentiments("bing")) %>%
  count(year, index = linenumber, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>% 
  mutate(sentiment = positive - negative)
Joining with `by = join_by(word)`
ggplot(fdr_sentiment, aes(x = year, y = sentiment, fill = year)) +
  geom_col(show.legend = FALSE) + 
  scale_x_continuous(breaks = seq(min(fdr_sentiment$year), max(fdr_sentiment$year), by = 1)) +
  labs (title = "FDR's net sentiment over time")

There are net negatives in 1938 and 1942, and a big positivity spike in 1945. Unfortunately, we have no way of telling what was happening.

Lincoln:

lincoln_sentiment <- tidy_lincoln %>%
  inner_join(get_sentiments("bing")) %>%
  count(year, index = linenumber, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>% 
  mutate(sentiment = positive - negative)
Joining with `by = join_by(word)`
ggplot(lincoln_sentiment, aes(x = year, y = sentiment, fill = year)) +
  geom_col(show.legend = FALSE) + 
  scale_x_continuous(breaks = seq(min(lincoln_sentiment$year), max(lincoln_sentiment$year), by = 1)) +
  labs (title = "Lincoln's net sentiment over time")

Lincoln’s net sentiment shows as generally very positive, with a small dip in 1863.

Joy Words

What “joy” words did Obama use most?

tidy_obama %>%
  inner_join(nrc_joy) %>%
  count(word, sort = TRUE)
Joining with `by = join_by(word)`
# A tibble: 170 × 2
   word         n
   <chr>    <int>
 1 good        54
 2 laughter    48
 3 pay         41
 4 clean       37
 5 money       37
 6 create      35
 7 vote        34
 8 finally     32
 9 progress    29
10 save        29
# ℹ 160 more rows

Apparently “pay,” “clean,” and “vote” are joy words. The other words seem like they belong, but 48 instances of the word “laughter” is notable.

Creating a Comparison for FDR’s Speeches

#FDR adn AFINN
afinn_fdr <- tidy_roosevelt %>% 
  inner_join(get_sentiments("afinn")) %>% 
  group_by(year) %>% 
  summarise(sentiment = sum(value)) %>% 
  mutate(method = "AFINN")
Joining with `by = join_by(word)`
bing_and_nrc_fdr <- bind_rows(
  tidy_roosevelt %>% 
    inner_join(get_sentiments("bing")) %>%
    mutate(method = "Bing et al."),
  tidy_roosevelt %>% 
    inner_join(get_sentiments("nrc") %>% 
                 filter(sentiment %in% c("positive", 
                                         "negative"))
    ) %>%
    mutate(method = "NRC")) %>%
    count(method, year, sentiment) %>%
    pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>% 
    mutate(sentiment = positive - negative)
Joining with `by = join_by(word)`
Joining with `by = join_by(word)`
Warning in inner_join(., get_sentiments("nrc") %>% filter(sentiment %in% : Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 1063 of `x` matches multiple rows in `y`.
ℹ Row 4872 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship =
  "many-to-many"` to silence this warning.
fdr_compiled <- bind_rows(afinn_fdr, 
          bing_and_nrc_fdr)

fdr_compiled |> 
  ggplot(aes(year, sentiment, fill = year)) +
    geom_col(show.legend = FALSE) +
    facet_wrap(~method, ncol = 1, scales = "free_y") +
    scale_x_continuous(breaks = seq(min(fdr_compiled$year), max(fdr_compiled$year), by = 1)) +
    labs(title = "Comparing Afinn, Bing, and NRC using FDR's SOTUs")

Interestingly, between Bing and AFINN, the net positive years look somewhat similar. However, the negative years look a lot more negative with AFINN. NRC registers everything as positive. There’s a lot of variation in the way 1944 is scored.

Most Common Positive and Negative Words

Bing (we’ll do it for all three sets of speeches here):

bing_word_counts_fdr <- tidy_roosevelt %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
Joining with `by = join_by(word)`
bing_word_counts_fdr
# A tibble: 991 × 3
   word     sentiment     n
   <chr>    <chr>     <int>
 1 peace    positive    141
 2 great    positive     97
 3 work     positive     86
 4 well     positive     60
 5 good     positive     57
 6 right    positive     54
 7 freedom  positive     53
 8 problems negative     47
 9 attack   negative     42
10 faith    positive     33
# ℹ 981 more rows
bing_word_counts_obama <- tidy_obama %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
Joining with `by = join_by(word)`
bing_word_counts_obama
# A tibble: 809 × 3
   word    sentiment     n
   <chr>   <chr>     <int>
 1 work    positive    158
 2 like    positive    116
 3 right   positive    111
 4 hard    negative     66
 5 better  positive     58
 6 reform  positive     55
 7 good    positive     54
 8 support positive     52
 9 best    positive     43
10 clean   positive     37
# ℹ 799 more rows
bing_word_counts_lincoln <- tidy_lincoln %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
Joining with `by = join_by(word)`
bing_word_counts_lincoln
# A tibble: 617 × 3
   word      sentiment     n
   <chr>     <chr>     <int>
 1 great     positive     61
 2 well      positive     29
 3 free      positive     27
 4 debt      negative     23
 5 good      positive     19
 6 proper    positive     18
 7 slave     negative     17
 8 important positive     16
 9 peace     positive     15
10 best      positive     14
# ℹ 607 more rows

Graphing Positive and Negative Word Counts

#FDR
bing_word_counts_fdr %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>% 
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = "Contribution to sentiment (FDR)",
       y = NULL)

#Obama
bing_word_counts_obama %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>% 
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = "Contribution to sentiment (Obama)",
       y = NULL)

#Lincoln
bing_word_counts_lincoln %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>% 
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = "Contribution to sentiment (Lincoln)",
       y = NULL)

“Great” shows up a lot in FDR’s positive words. Was he talking about the Great Depression or the Great War?

A quick search revealed that people did call it the “Great Depression” while it was happening, but FDR did not use the term in his speeches. In 1939, he used the term “great unemployment of capital” to refer to the national debt, which was apparently not bad.

Presidential Word Clouds

#specifying the wordcloud package for these
tidy_obama %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud::wordcloud(word, n, max.words = 100))
Joining with `by = join_by(word)`

tidy_roosevelt %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud::wordcloud(word, n, max.words = 100))
Joining with `by = join_by(word)`

tidy_lincoln %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud::wordcloud(word, n, max.words = 100))
Joining with `by = join_by(word)`

Positive / negative word clouds

tidy_obama %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray20", "gray80"),
                   max.words = 100)
Joining with `by = join_by(word)`

tidy_roosevelt %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray20", "gray80"),
                   max.words = 100)
Joining with `by = join_by(word)`

tidy_lincoln %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray20", "gray80"),
                   max.words = 100)
Joining with `by = join_by(word)`

Sentences

Breaking Obama’s speeches down into sentences.

obama_sentences <- obama %>% 
  unnest_tokens(sentence, text, token = "sentences")

obama_sentences$sentence[2]
[1] "vice president, members of congress, the first lady of the united states--she's around here somewhere: i have come here tonight not only to address the distinguished men and women in this great chamber, but to speak frankly and directly to the men and women who sent us here."

This is a sentence fragment because the first sentence ends with “mr.”

obama_sentences$sentence[1]
[1] "madam speaker, mr."
fdr_sentences <- d_roosevelt %>% 
  unnest_tokens(sentence, text, token = "sentences")

lincoln_sentences <- lincoln %>% 
  unnest_tokens(sentence, text, token = "sentences")

Who had the shortest/longest sentences?

wordcounts_obama <- obama_sentences %>%
  mutate(words = str_count(sentence, "\\S+"))


wordcounts_fdr <- fdr_sentences %>%
  mutate(words = str_count(sentence, "\\S+"))

wordcounts_lincoln <- lincoln_sentences %>%
  mutate(words = str_count(sentence, "\\S+"))

Calculating the Mean by Speech

avg_sentence_obama <- wordcounts_obama |> group_by(year) |>
  summarise(avg = mean(words))

avg_sentence_obama |> 
  ggplot(aes(year, avg, fill = year)) +
    geom_col(show.legend = FALSE) +
    scale_x_continuous(breaks = seq(min(avg_sentence_obama$year), max(avg_sentence_obama$year), by = 1)) +
    labs(title = "Average words per sentence (Obama)")

avg_sentence_fdr <- wordcounts_fdr |> group_by(year) |>
  summarise(avg = mean(words))

avg_sentence_fdr |> 
  ggplot(aes(year, avg, fill = year)) +
    geom_col(show.legend = FALSE) +
    scale_x_continuous(breaks = seq(min(avg_sentence_fdr$year), max(avg_sentence_fdr$year), by = 1)) +
    labs(title = "Average words per sentence (FDR)")

avg_sentence_lincoln <- wordcounts_lincoln |> group_by(year) |>
  summarise(avg = mean(words))

avg_sentence_lincoln |> 
  ggplot(aes(year, avg, fill = year)) +
    geom_col(show.legend = FALSE) +
    scale_x_continuous(breaks = seq(min(avg_sentence_lincoln$year), max(avg_sentence_lincoln$year), by = 1)) +
    labs(title = "Average words per sentence (Lincoln)")

Obama had the shortest words per sentence. Could this have been a symptom of changing times/attention spans?

Speech Lengths

We can’t unnest by chapter. Instead, let’s see which speech was the longest.

obama_speech_length <- tidy_obama |> count(year, name = "word_count")

obama_speech_length |> ggplot(aes(x = year, y = word_count, fill = year)) +
  geom_col(show.legend = FALSE) +
    scale_x_continuous(breaks = seq(min(obama_speech_length$year), max(obama_speech_length$year), by = 1)) +
    labs(title = "Obama's SOTU speech lengths in words")

fdr_speech_length <- tidy_roosevelt |> count(year, name = "word_count")

fdr_speech_length |> ggplot(aes(x = year, y = word_count, fill = year)) +
  geom_col(show.legend = FALSE) +
    scale_x_continuous(breaks = seq(min(fdr_speech_length$year), max(fdr_speech_length$year), by = 1)) +
    labs(title = "FDR's SOTU speech lengths in words")

lincoln_speech_length <- tidy_lincoln |> count(year, name = "word_count")

lincoln_speech_length |> ggplot(aes(x = year, y = word_count, fill = year)) +
  geom_col(show.legend = FALSE) +
    scale_x_continuous(breaks = seq(min(lincoln_speech_length$year), max(lincoln_speech_length$year), by = 1)) +
    labs(title = "Lincoln's SOTU speech lengths in words")

FDR’s speeches were the shortest in general, with one long speech in 1945.

Quantifying Presidential Negativity

Quantifying the negativity ratio in each of Obama’s SOTUs:

tidy_obama %>%
  semi_join(bingnegative) %>%
  group_by(year) %>%
  summarize(negativewords = n()) %>%
  left_join(obama_speech_length, by = "year") %>%
  mutate(ratio = negativewords/word_count) %>%
  ungroup()
Joining with `by = join_by(word)`
# A tibble: 8 × 4
   year negativewords word_count  ratio
  <int>         <int>      <int>  <dbl>
1  2009           175       6092 0.0287
2  2010           222       7263 0.0306
3  2011           104       6909 0.0151
4  2012           152       7058 0.0215
5  2013           125       6851 0.0182
6  2014           137       7064 0.0194
7  2015           167       6797 0.0246
8  2016           153       6078 0.0252
#the most negative year
tidy_obama %>%
  semi_join(bingnegative) %>%
  group_by(year) %>%
  summarize(negativewords = n()) %>%
  left_join(obama_speech_length, by = "year") %>%
  mutate(ratio = negativewords/word_count) %>%
  slice_max(ratio, n = 1) %>% 
  ungroup()
Joining with `by = join_by(word)`
# A tibble: 1 × 4
   year negativewords word_count  ratio
  <int>         <int>      <int>  <dbl>
1  2010           222       7263 0.0306

For FDR:

tidy_roosevelt %>%
  semi_join(bingnegative) %>%
  group_by(year) %>%
  summarize(negativewords = n()) %>%
  left_join(fdr_speech_length, by = "year") %>%
  mutate(ratio = negativewords/word_count) %>%
  ungroup()
Joining with `by = join_by(word)`
# A tibble: 12 × 4
    year negativewords word_count  ratio
   <int>         <int>      <int>  <dbl>
 1  1934            68       2233 0.0305
 2  1935           105       3536 0.0297
 3  1936           116       3835 0.0302
 4  1937            69       2743 0.0252
 5  1938           159       4716 0.0337
 6  1939           117       3785 0.0309
 7  1940            98       3218 0.0305
 8  1941            92       3332 0.0276
 9  1942           116       3523 0.0329
10  1943           150       4618 0.0325
11  1944           104       3842 0.0271
12  1945           330      11364 0.0290
#the most negative year
tidy_roosevelt %>%
  semi_join(bingnegative) %>%
  group_by(year) %>%
  summarize(negativewords = n()) %>%
  left_join(fdr_speech_length, by = "year") %>%
  mutate(ratio = negativewords/word_count) %>%
  slice_max(ratio, n = 1) %>% 
  ungroup()
Joining with `by = join_by(word)`
# A tibble: 1 × 4
   year negativewords word_count  ratio
  <int>         <int>      <int>  <dbl>
1  1938           159       4716 0.0337
tidy_lincoln %>%
  semi_join(bingnegative) %>%
  group_by(year) %>%
  summarize(negativewords = n()) %>%
  left_join(lincoln_speech_length, by = "year") %>%
  mutate(ratio = negativewords/word_count) %>%
  ungroup()
Joining with `by = join_by(word)`
# A tibble: 4 × 4
   year negativewords word_count  ratio
  <int>         <int>      <int>  <dbl>
1  1861           169       6998 0.0241
2  1862           194       8410 0.0231
3  1863           128       6132 0.0209
4  1864           116       5975 0.0194
#the most negative year
tidy_lincoln %>%
  semi_join(bingnegative) %>%
  group_by(year) %>%
  summarize(negativewords = n()) %>%
  left_join(lincoln_speech_length, by = "year") %>%
  mutate(ratio = negativewords/word_count) %>%
  slice_max(ratio, n = 1) %>% 
  ungroup()
Joining with `by = join_by(word)`
# A tibble: 1 × 4
   year negativewords word_count  ratio
  <int>         <int>      <int>  <dbl>
1  1861           169       6998 0.0241

Lincoln was the least negative president. (Read: Lincoln’s speeches hat the fewest ratio of negative words, as per the Bing scale).

Finding a Better Word Cloud

let’s try Wordcloud2:

library(wordcloud2)
Warning: package 'wordcloud2' was built under R version 4.5.3
#demo here: wordcloud2(data = demoFreq)

obama_frequency <- tidy_obama %>%
  anti_join(stop_words) %>%
  count(word, sort = TRUE)
Joining with `by = join_by(word)`
wordcloud2(data = obama_frequency)
fdr_frequency <- tidy_roosevelt %>%
  anti_join(stop_words) %>%
  count(word, sort = TRUE)
Joining with `by = join_by(word)`
wordcloud2(data = fdr_frequency)

ggplot Word Cloud

ggplot includes a word cloud function. You can add color and tilt the words. If you don’t limit the number of words displayed, they’ll just stack on top of each other. It also looks kind of sparse.

A demo on rpubs here: https://rpubs.com/mgei/1259234, which this code was pulled from:

library(ggwordcloud)

lincoln_frequency <- tidy_lincoln %>%
  anti_join(stop_words) %>%
  count(word, sort = TRUE)
Joining with `by = join_by(word)`
ggplot(lincoln_frequency |> head(35) |> 
         mutate(first_letter = str_sub(word, 1, 1) |> str_to_lower()), 
       aes(label = word, size = n, color = word)) +
  geom_text_wordcloud(area_corr = TRUE) +
  scale_size_area(max_size = 40) +
  theme_minimal()

Modelwordcloud

Again, taken from this demo:

https://rpubs.com/mgei/1259234

library(modelwordcloud)

Attaching package: 'modelwordcloud'
The following object is masked from 'package:wordcloud':

    wordcloud
modelwordcloud::wordcloud(words = obama_frequency[1:30, "word"][[1]], 
                          freq = obama_frequency[1:30, "n"][[1]])

#adding random color
modelwordcloud::wordcloud(words = obama_frequency[1:50, "word"][[1]], 
          freq = obama_frequency[1:50, "n"][[1]],
          colors = RColorBrewer::brewer.pal(8, "Accent"),
          random_color = TRUE)

modelwordcloud::wordcloud(words = obama_frequency[1:80, "word"][[1]], 
          freq = obama_frequency[1:80, "n"][[1]],
          colors = RColorBrewer::brewer.pal(8, "Accent"),
          random_color = TRUE)

PubMedWordCloud

This code was also pulled/modified from the demo:

library(PubMedWordcloud)

plotWordCloud(obama_frequency |> rename(freq = n))

plotWordCloud(obama_frequency |> rename(freq = n), 
              scale = c(3, 0.3), min.freq = 1, max.words = 200,
              random.order = FALSE, rot.per = 0.1, use.r.layout = T,
              colors = brewer.pal(8, "Accent"))

Analysis

Overall, wordcloud2 is the most aesthetically pleasing and T-shirt ready, but it can be overwhelming if you’re just trying to understand the tone. PubMed seems like a more organized and straightforward way to get an idea of the quantitative data you’re working with. ggword cloud looked a lot better in the demo cited, but it’s having problems rendering, as is modelwordcloud.

It’s also interesting to see that the tone and language of speeches from another time represented the zeitgeist.