I tried forking the repository to get the data, but when I ran the code it was very hard to get running because the dependencies seemed outdated, so I essentially copied the main functions I needed.
Reference for the text: John F. Kennedy's Inaugural Address: https://www.gutenberg.org/cache/epub/3/pg3-images.html
Reference for the repository: https://github.com/dgrtwo/tidy-text-mining/blob/5da7251f492c365f7b7e994ac3b5d642e7240194/02-sentiment-analysis.Rmd
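Since I could not reproduce the repository's pinned dependencies, a minimal sketch of getting set up is to install the current CRAN versions of the packages used below; results may differ slightly from the original repository's versions.
# Install current CRAN versions of the packages used in this analysis
# (versions are not pinned, so behavior may differ slightly from the
# original repository).
install.packages(c("tidytext", "janeaustenr", "dplyr", "stringr", "tidyr",
                   "ggplot2", "wordcloud", "RColorBrewer", "reshape2",
                   "gutenbergr"))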
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.2.2
library(janeaustenr)
## Warning: package 'janeaustenr' was built under R version 4.2.2
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
## Warning: package 'stringr' was built under R version 4.2.2
library(wordcloud)
## Loading required package: RColorBrewer
get_sentiments("bing")
## # A tibble: 6,786 × 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # … with 6,776 more rows
get_sentiments("afinn")
## # A tibble: 2,477 × 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # … with 2,467 more rows
get_sentiments("nrc")
## # A tibble: 13,872 × 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # … with 13,862 more rows
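The three lexicons score words differently: AFINN assigns numeric values, while Bing and NRC assign categories. As a quick spot check of my own (not from the repository), a single word can be looked up across lexicons:
# Spot check: "abandon" gets a numeric score from AFINN and several
# category tags from NRC.
get_sentiments("afinn") %>% filter(word == "abandon")
get_sentiments("nrc") %>% filter(word == "abandon")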
tidy_books <- austen_books() %>%
group_by(book) %>%
mutate(
linenumber = row_number(),
chapter = cumsum(str_detect(text,
regex("^chapter [\\divxlc]",
ignore_case = TRUE)))) %>%
ungroup() %>%
unnest_tokens(word, text)
austen_books()
## # A tibble: 73,422 × 2
## text book
## * <chr> <fct>
## 1 "SENSE AND SENSIBILITY" Sense & Sensibility
## 2 "" Sense & Sensibility
## 3 "by Jane Austen" Sense & Sensibility
## 4 "" Sense & Sensibility
## 5 "(1811)" Sense & Sensibility
## 6 "" Sense & Sensibility
## 7 "" Sense & Sensibility
## 8 "" Sense & Sensibility
## 9 "" Sense & Sensibility
## 10 "CHAPTER 1" Sense & Sensibility
## # … with 73,412 more rows
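As a sanity check of my own (not from the repository), the cumulative chapter counter built above can be summarized per book:
# Sketch: the maximum of the cumulative chapter counter gives the number
# of chapter headings detected in each book.
tidy_books %>%
group_by(book) %>%
summarise(chapters = max(chapter))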
nrc_joy <- get_sentiments("nrc") %>%
filter(sentiment == "joy")
nrc_joy
## # A tibble: 687 × 2
## word sentiment
## <chr> <chr>
## 1 absolution joy
## 2 abundance joy
## 3 abundant joy
## 4 accolade joy
## 5 accompaniment joy
## 6 accomplish joy
## 7 accomplished joy
## 8 achieve joy
## 9 achievement joy
## 10 acrobat joy
## # … with 677 more rows
tidy_books %>%
filter(book == "Emma") %>%
inner_join(nrc_joy) %>%
count(word, sort = TRUE)
## Joining, by = "word"
## # A tibble: 301 × 2
## word n
## <chr> <int>
## 1 good 359
## 2 friend 166
## 3 hope 143
## 4 happy 125
## 5 love 117
## 6 deal 92
## 7 found 92
## 8 present 89
## 9 kind 82
## 10 happiness 76
## # … with 291 more rows
library(tidyr)
jane_austen_sentiment <- tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(book, index = linenumber %/% 80, sentiment) %>%
pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining, by = "word"
jane_austen_sentiment
## # A tibble: 920 × 5
## book index negative positive sentiment
## <fct> <dbl> <int> <int> <int>
## 1 Sense & Sensibility 0 16 32 16
## 2 Sense & Sensibility 1 19 53 34
## 3 Sense & Sensibility 2 12 31 19
## 4 Sense & Sensibility 3 15 31 16
## 5 Sense & Sensibility 4 16 34 18
## 6 Sense & Sensibility 5 16 51 35
## 7 Sense & Sensibility 6 24 40 16
## 8 Sense & Sensibility 7 23 51 28
## 9 Sense & Sensibility 8 30 40 10
## 10 Sense & Sensibility 9 15 19 4
## # … with 910 more rows
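The index = linenumber %/% 80 step uses integer division to bin the text into 80-line sections, so sentiment is tracked across chunks of the narrative rather than single lines. A minimal illustration of what %/% does:
# Integer division assigns consecutive line numbers to 80-line bins:
c(1, 79, 80, 159, 160) %/% 80 # returns 0 0 1 1 2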
library(ggplot2)
ggplot(jane_austen_sentiment, aes(index, sentiment, fill = book)) +
geom_col(show.legend = FALSE) +
facet_wrap(~book, ncol = 2, scales = "free_x")
## Comparing the three sentiment dictionaries
pride_prejudice <- tidy_books %>%
filter(book == "Pride & Prejudice")
pride_prejudice
## # A tibble: 122,204 × 4
## book linenumber chapter word
## <fct> <int> <int> <chr>
## 1 Pride & Prejudice 1 0 pride
## 2 Pride & Prejudice 1 0 and
## 3 Pride & Prejudice 1 0 prejudice
## 4 Pride & Prejudice 3 0 by
## 5 Pride & Prejudice 3 0 jane
## 6 Pride & Prejudice 3 0 austen
## 7 Pride & Prejudice 7 1 chapter
## 8 Pride & Prejudice 7 1 1
## 9 Pride & Prejudice 10 1 it
## 10 Pride & Prejudice 10 1 is
## # … with 122,194 more rows
afinn <- pride_prejudice %>%
inner_join(get_sentiments("afinn")) %>%
group_by(index = linenumber %/% 80) %>%
summarise(sentiment = sum(value)) %>%
mutate(method = "AFINN")
## Joining, by = "word"
afinn
## # A tibble: 163 × 3
## index sentiment method
## <dbl> <dbl> <chr>
## 1 0 29 AFINN
## 2 1 0 AFINN
## 3 2 20 AFINN
## 4 3 30 AFINN
## 5 4 62 AFINN
## 6 5 66 AFINN
## 7 6 60 AFINN
## 8 7 18 AFINN
## 9 8 84 AFINN
## 10 9 26 AFINN
## # … with 153 more rows
bing_and_nrc <- bind_rows(
pride_prejudice %>%
inner_join(get_sentiments("bing")) %>%
mutate(method = "Bing et al."),
pride_prejudice %>%
inner_join(get_sentiments("nrc") %>%
filter(sentiment %in% c("positive",
"negative"))
) %>%
mutate(method = "NRC")) %>%
count(method, index = linenumber %/% 80, sentiment) %>%
pivot_wider(names_from = sentiment,
values_from = n,
values_fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining, by = "word"
## Joining, by = "word"
bing_and_nrc
## # A tibble: 326 × 5
## method index negative positive sentiment
## <chr> <dbl> <int> <int> <int>
## 1 Bing et al. 0 7 21 14
## 2 Bing et al. 1 20 19 -1
## 3 Bing et al. 2 16 20 4
## 4 Bing et al. 3 19 31 12
## 5 Bing et al. 4 23 47 24
## 6 Bing et al. 5 15 49 34
## 7 Bing et al. 6 18 46 28
## 8 Bing et al. 7 23 33 10
## 9 Bing et al. 8 17 48 31
## 10 Bing et al. 9 22 40 18
## # … with 316 more rows
bind_rows(afinn,
bing_and_nrc) %>%
ggplot(aes(index, sentiment, fill = method)) +
geom_col(show.legend = FALSE) +
facet_wrap(~method, ncol = 1, scales = "free_y")
get_sentiments("nrc") %>%
filter(sentiment %in% c("positive", "negative")) %>%
count(sentiment)
## # A tibble: 2 × 2
## sentiment n
## <chr> <int>
## 1 negative 3316
## 2 positive 2308
get_sentiments("bing") %>%
count(sentiment)
## # A tibble: 2 × 2
## sentiment n
## <chr> <int>
## 1 negative 4781
## 2 positive 2005
bing_word_counts <- tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
ungroup()
## Joining, by = "word"
bing_word_counts
## # A tibble: 2,585 × 3
## word sentiment n
## <chr> <chr> <int>
## 1 miss negative 1855
## 2 well positive 1523
## 3 good positive 1380
## 4 great positive 981
## 5 like positive 725
## 6 better positive 639
## 7 enough positive 613
## 8 happy positive 534
## 9 love positive 495
## 10 pleasure positive 462
## # … with 2,575 more rows
bing_word_counts %>%
group_by(sentiment) %>%
slice_max(n, n = 10) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(n, word, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(x = "Contribution to sentiment",
y = NULL)
custom_stop_words <- bind_rows(tibble(word = c("miss"),
lexicon = c("custom")),
stop_words)
custom_stop_words
## # A tibble: 1,150 × 2
## word lexicon
## <chr> <chr>
## 1 miss custom
## 2 a SMART
## 3 a's SMART
## 4 able SMART
## 5 about SMART
## 6 above SMART
## 7 according SMART
## 8 accordingly SMART
## 9 across SMART
## 10 actually SMART
## # … with 1,140 more rows
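The custom stop list is defined but never applied above, so here is a sketch of my own showing one way to use it, following the book's motivation of dropping "miss" (a form of address in Austen's novels, not a negative sentiment):
# Sketch: drop all stop words, including the custom entry "miss",
# before recounting Bing sentiment matches.
tidy_books %>%
anti_join(custom_stop_words, by = "word") %>%
inner_join(get_sentiments("bing"), by = "word") %>%
count(word, sentiment, sort = TRUE)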
library(wordcloud)
tidy_books %>%
anti_join(stop_words) %>%
count(word) %>%
with(wordcloud(word, n, max.words = 100))
## Joining, by = "word"
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.2.2
##
## Attaching package: 'reshape2'
##
## The following object is masked from 'package:tidyr':
##
## smiths
tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
acast(word ~ sentiment, value.var = "n", fill = 0) %>%
comparison.cloud(colors = c("gray20", "gray80"),
max.words = 100)
## Joining, by = "word"
p_and_p_sentences <- tibble(text = prideprejudice) %>%
unnest_tokens(sentence, text, token = "sentences")
p_and_p_sentences$sentence[2]
## [1] "by jane austen"
austen_chapters <- austen_books() %>%
group_by(book) %>%
unnest_tokens(chapter, text, token = "regex",
pattern = "Chapter|CHAPTER [\\dIVXLC]") %>%
ungroup()
austen_chapters %>%
group_by(book) %>%
summarise(chapters = n())
## # A tibble: 6 × 2
## book chapters
## <fct> <int>
## 1 Sense & Sensibility 51
## 2 Pride & Prejudice 62
## 3 Mansfield Park 49
## 4 Emma 56
## 5 Northanger Abbey 32
## 6 Persuasion 25
bingnegative <- get_sentiments("bing") %>%
filter(sentiment == "negative")
wordcounts <- tidy_books %>%
group_by(book, chapter) %>%
summarize(words = n())
## `summarise()` has grouped output by 'book'. You can override using the
## `.groups` argument.
tidy_books %>%
semi_join(bingnegative) %>%
group_by(book, chapter) %>%
summarize(negativewords = n()) %>%
left_join(wordcounts, by = c("book", "chapter")) %>%
mutate(ratio = negativewords/words) %>%
filter(chapter != 0) %>%
slice_max(ratio, n = 1) %>%
ungroup()
## Joining, by = "word"
## `summarise()` has grouped output by 'book'. You can override using the
## `.groups` argument.
## # A tibble: 6 × 5
## book chapter negativewords words ratio
## <fct> <int> <int> <int> <dbl>
## 1 Sense & Sensibility 43 161 3405 0.0473
## 2 Pride & Prejudice 34 111 2104 0.0528
## 3 Mansfield Park 46 173 3685 0.0469
## 4 Emma 15 151 3340 0.0452
## 5 Northanger Abbey 21 149 2982 0.0500
## 6 Persuasion 4 62 1807 0.0343
I decided to use the gutenbergr package to download the text of the address.
library("gutenbergr")
gutenberg_works() %>%
filter(title == "John F. Kennedy's Inaugural Address")
## # A tibble: 1 × 8
## gutenberg_id title author guten…¹ langu…² guten…³ rights has_t…⁴
## <int> <chr> <chr> <int> <chr> <chr> <chr> <lgl>
## 1 3 John F. Kennedy's … Kenne… 1666 en <NA> Publi… TRUE
## # … with abbreviated variable names ¹gutenberg_author_id, ²language,
## # ³gutenberg_bookshelf, ⁴has_text
# JFK's Inaugural Address
JFK <- gutenberg_download(3)
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
text <- tibble(lines = 1:nrow(JFK), text = JFK$text)
clean_book <- text %>%
unnest_tokens(word, text) # splits the text column into one word per row
# Most common sentiment words (Bing lexicon)
clean_book %>%
inner_join(get_sentiments("bing")) %>%
count(word, sort = TRUE)
## Joining, by = "word"
## # A tibble: 97 × 2
## word n
## <chr> <int>
## 1 free 5
## 2 freedom 4
## 3 peace 4
## 4 good 3
## 5 poverty 3
## 6 well 3
## 7 assure 2
## 8 best 2
## 9 burden 2
## 10 destruction 2
## # … with 87 more rows
# Word cloud of the most common words (stop words removed)
clean_book %>%
anti_join(stop_words) %>%
count(word) %>%
with(wordcloud(word, n, max.words = 100))
## Joining, by = "word"
# Word cloud of words matched by the NRC lexicon
clean_book %>%
inner_join(get_sentiments("nrc")) %>%
anti_join(stop_words) %>%
count(word) %>%
with(wordcloud(word, n, max.words = 100))
## Joining, by = "word"
## Joining, by = "word"
## Warning in wordcloud(word, n, max.words = 100): revolution could not be fit on
## page. It will not be plotted.
bing_word_counts <- clean_book %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort=TRUE)
## Joining, by = "word"
bing_word_counts %>%
group_by(sentiment) %>%
top_n(10) %>%
ggplot(aes(reorder(word, n), n, fill = sentiment)) +
geom_bar(stat = "identity", show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment", x = NULL) +
coord_flip()
## Selecting by n
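To connect this to the conclusion below, a short sketch of my own tallies the overall positive/negative balance of the address:
# Sketch: overall count of positive vs. negative Bing matches in the address.
clean_book %>%
inner_join(get_sentiments("bing"), by = "word") %>%
count(sentiment)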
# Conclusion
John F. Kennedy's Inaugural Address was one that promoted and fostered peace and freedom within the nation. He made a point of expressing his concern for the well-being of the nation's constituents. Conversely, he also alluded to the ideas of poverty and tyranny. From this sentiment analysis, my personal inference, without reading the actual address, is that JFK was hoping to inspire hope for the future of the USA.