library(janeaustenr)
library(tidytext)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(tidyr)
library(ggplot2)
library(wordcloud)
## Loading required package: RColorBrewer
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library(ngram)
library(corpus)
library(lexicon)
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
library(formattable)
library(knitr)
library(gutenbergr)
afinn_lex <- get_sentiments("afinn")
head(afinn_lex, 5)
## # A tibble: 5 x 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
bing_lex <- get_sentiments("bing")
head(bing_lex, 5)
## # A tibble: 5 x 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
nrc_lex <- get_sentiments("nrc")
head(nrc_lex, 5)
## # A tibble: 5 x 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
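The three lexicons score words differently: AFINN assigns an integer value from -5 to 5, Bing a binary positive/negative label, and NRC one or more emotion categories. A minimal sketch comparing how the single word "abandon" is handled by each, using the tibbles above (that "abandon" also appears in the Bing lexicon is an assumption):
# How "abandon" is scored by each lexicon
bind_rows(
  afinn_lex %>% filter(word == "abandon") %>%
    transmute(lexicon = "AFINN", word, label = as.character(value)),
  bing_lex %>% filter(word == "abandon") %>%
    transmute(lexicon = "Bing", word, label = sentiment),
  nrc_lex %>% filter(word == "abandon") %>%
    transmute(lexicon = "NRC", word, label = sentiment)
)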
Tidying the novels: track line and chapter numbers, then tokenize into one word per row.
tidy_books <- austen_books() %>%
group_by(book) %>%
mutate(
linenumber = row_number(),
chapter = cumsum(str_detect(text,
regex("^chapter [\\divxlc]",
ignore_case = TRUE)))) %>%
ungroup() %>%
unnest_tokens(word, text)
What are the most common joy words in “Emma”?
nrc_joy <- get_sentiments("nrc") %>%
filter(sentiment == 'joy')
words_of_joy <- tidy_books %>% filter(book == "Emma") %>%
inner_join(nrc_joy) %>%
count(word, sort = TRUE)
## Joining, by = "word"
head(words_of_joy, 7)
## # A tibble: 7 x 2
## word n
## <chr> <int>
## 1 good 359
## 2 young 192
## 3 friend 166
## 4 hope 143
## 5 happy 125
## 6 love 117
## 7 deal 92
jane_austen_sentiment <- tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(book, index = linenumber %/% 80, sentiment) %>%
pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining, by = "word"
ggplot(jane_austen_sentiment, aes(index, sentiment, fill = book)) +
geom_col(show.legend = FALSE) +
facet_wrap(~book, ncol = 2, scales = "free_x")
pride_prejudice <- tidy_books %>%
filter(book == "Pride & Prejudice")
pride_prejudice
## # A tibble: 122,204 x 4
## book linenumber chapter word
## <fct> <int> <int> <chr>
## 1 Pride & Prejudice 1 0 pride
## 2 Pride & Prejudice 1 0 and
## 3 Pride & Prejudice 1 0 prejudice
## 4 Pride & Prejudice 3 0 by
## 5 Pride & Prejudice 3 0 jane
## 6 Pride & Prejudice 3 0 austen
## 7 Pride & Prejudice 7 1 chapter
## 8 Pride & Prejudice 7 1 1
## 9 Pride & Prejudice 10 1 it
## 10 Pride & Prejudice 10 1 is
## # ... with 122,194 more rows
afinn <- pride_prejudice %>%
inner_join(get_sentiments("afinn")) %>%
group_by(index = linenumber %/% 80) %>%
summarise(sentiment = sum(value)) %>%
mutate(method = "AFINN")
## Joining, by = "word"
## `summarise()` ungrouping output (override with `.groups` argument)
bing_and_nrc <- bind_rows(
pride_prejudice %>%
inner_join(get_sentiments("bing")) %>%
mutate(method = "Bing et al."),
pride_prejudice %>%
inner_join(get_sentiments("nrc") %>%
filter(sentiment %in% c("positive",
"negative"))
) %>%
mutate(method = "NRC")) %>%
count(method, index = linenumber %/% 80, sentiment) %>%
pivot_wider(names_from = sentiment,
values_from = n,
values_fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining, by = "word"
## Joining, by = "word"
bind_rows(afinn,
bing_and_nrc) %>%
ggplot(aes(index, sentiment, fill = method)) +
geom_col(show.legend = FALSE) +
facet_wrap(~method, ncol = 1, scales = "free_y")
get_sentiments("nrc") %>%
filter(sentiment %in% c("positive", "negative")) %>%
count(sentiment)
## # A tibble: 2 x 2
## sentiment n
## <chr> <int>
## 1 negative 3324
## 2 positive 2312
get_sentiments("bing") %>%
filter(sentiment %in% c("positive", "negative")) %>%
count(sentiment)
## # A tibble: 2 x 2
## sentiment n
## <chr> <int>
## 1 negative 4781
## 2 positive 2005
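Both lexicons contain more negative than positive words, but the ratio differs: roughly 1.4 negative words per positive for NRC versus about 2.4 for Bing, which feeds into the differences between the methods plotted above. A quick sketch computing that ratio from the same counts:
# Negative-to-positive word ratio in each lexicon
bind_rows(
  get_sentiments("nrc") %>%
    filter(sentiment %in% c("positive", "negative")) %>%
    count(sentiment) %>%
    mutate(lexicon = "NRC"),
  get_sentiments("bing") %>%
    count(sentiment) %>%
    mutate(lexicon = "Bing")
) %>%
  pivot_wider(names_from = sentiment, values_from = n) %>%
  mutate(neg_per_pos = negative / positive)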
bing_word_counts <- tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
ungroup()
## Joining, by = "word"
head(bing_word_counts, 7)
## # A tibble: 7 x 3
## word sentiment n
## <chr> <chr> <int>
## 1 miss negative 1855
## 2 well positive 1523
## 3 good positive 1380
## 4 great positive 981
## 5 like positive 725
## 6 better positive 639
## 7 enough positive 613
bing_word_counts %>%
group_by(sentiment) %>%
slice_max(n, n = 10) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(n, word, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(x = "Contribution to sentiment",
y = NULL)
The word “miss” tops the negative counts, yet in Austen it is mostly a title for young women rather than a negative term, so it can be added to a custom stop-word list:
custom_stop_words <- bind_rows(tibble(word = c("miss"),
lexicon = c("SMART")),
stop_words)
head(custom_stop_words, 7)
## # A tibble: 7 x 2
## word lexicon
## <chr> <chr>
## 1 miss SMART
## 2 a SMART
## 3 a's SMART
## 4 able SMART
## 5 about SMART
## 6 above SMART
## 7 according SMART
tidy_books %>%
anti_join(stop_words) %>%
count(word) %>%
with(wordcloud(word, n, max.words = 100))
## Joining, by = "word"
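Since custom_stop_words was built to drop “miss” as well, the same cloud can be redrawn with it; a minimal variant:
# Word cloud using the custom stop-word list (also drops "miss")
tidy_books %>%
  anti_join(custom_stop_words, by = "word") %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))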
Arranging the word cloud into positive and negative groups:
tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
acast(word ~ sentiment, value.var = "n", fill = 0) %>%
comparison.cloud(colors = c("coral4", "goldenrod2"))
## Joining, by = "word"
austen_chapters <- austen_books() %>%
group_by(book) %>%
unnest_tokens(chapter, text, token = "regex",
pattern = "Chapter|CHAPTER [\\dIVXLC]") %>%
ungroup()
austen_chapters %>%
group_by(book) %>%
summarise(chapters = n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 6 x 2
## book chapters
## <fct> <int>
## 1 Sense & Sensibility 51
## 2 Pride & Prejudice 62
## 3 Mansfield Park 49
## 4 Emma 56
## 5 Northanger Abbey 32
## 6 Persuasion 25
bingnegative <- get_sentiments("bing") %>%
filter(sentiment == "negative")
wordcounts <- tidy_books %>%
group_by(book, chapter) %>%
summarize(words = n())
## `summarise()` regrouping output by 'book' (override with `.groups` argument)
tidy_books %>%
semi_join(bingnegative) %>%
group_by(book, chapter) %>%
summarize(negativewords = n()) %>%
left_join(wordcounts, by = c("book", "chapter")) %>%
mutate(ratio = negativewords/words) %>%
filter(chapter != 0) %>%
slice_max(ratio, n = 1) %>%
ungroup()
## Joining, by = "word"
## `summarise()` regrouping output by 'book' (override with `.groups` argument)
## # A tibble: 6 x 5
## book chapter negativewords words ratio
## <fct> <int> <int> <int> <dbl>
## 1 Sense & Sensibility 43 161 3405 0.0473
## 2 Pride & Prejudice 34 111 2104 0.0528
## 3 Mansfield Park 46 173 3685 0.0469
## 4 Emma 15 151 3340 0.0452
## 5 Northanger Abbey 21 149 2982 0.0500
## 6 Persuasion 4 62 1807 0.0343
Question: Comparing the quantities of positive and negative words, which Jane Austen novel seems more likely to be uplifting: “Mansfield Park” or “Persuasion”?
mans <- tidy_books %>%
filter(book == "Mansfield Park")
per <- tidy_books %>%
filter(book == "Persuasion")
bing_word_counts <- tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
ungroup()
## Joining, by = "word"
positive_words <- bing_word_counts %>%
filter(sentiment == "positive")
head(positive_words, 3)
## # A tibble: 3 x 3
## word sentiment n
## <chr> <chr> <int>
## 1 well positive 1523
## 2 good positive 1380
## 3 great positive 981
negative_words <- bing_word_counts %>%
filter(sentiment == "negative")
head(negative_words, 3)
## # A tibble: 3 x 3
## word sentiment n
## <chr> <chr> <int>
## 1 miss negative 1855
## 2 poor negative 424
## 3 doubt negative 281
positive_mans <- positive_words %>% inner_join(mans, by = "word")
positive_mans_gram <- ngram(positive_mans$word, n = 1)
print(positive_mans_gram, output="truncated")
## admiring | 2
## NULL {2} |
##
## hug | 1
## NULL {1} |
##
## grandeur | 5
## NULL {5} |
##
## resolute | 1
## NULL {1} |
##
## humour | 25
## NULL {25} |
##
## [[ ... results truncated ... ]]
print(positive_mans_gram, output="summary")
## An ngram object with 690 1-grams
negative_mans <- negative_words %>% inner_join(mans, by = "word")
negative_mans_gram <- ngram(negative_mans$word, n = 1)
print(negative_mans_gram, output="truncated")
## delayed | 6
## NULL {6} |
##
## vexation | 14
## NULL {14} |
##
## bewildered | 7
## NULL {7} |
##
## unexpectedly | 4
## NULL {4} |
##
## shocked | 8
## NULL {8} |
##
## [[ ... results truncated ... ]]
print(negative_mans_gram, output="summary")
## An ngram object with 980 1-grams
positive_persuasion <- positive_words %>% inner_join(per, by = "word")
positive_persuasion_gram <- ngram(positive_persuasion$word, n = 1)
print(positive_persuasion_gram, output="truncated")
## admiring | 3
## NULL {3} |
##
## grandeur | 3
## NULL {3} |
##
## resolute | 3
## NULL {3} |
##
## humour | 10
## NULL {10} |
##
## willingly | 2
## NULL {2} |
##
## [[ ... results truncated ... ]]
print(positive_persuasion_gram, output="summary")
## An ngram object with 530 1-grams
negative_persuasion <- negative_words %>% inner_join(per, by = "word")
negative_persuasion_gram <- ngram(negative_persuasion$word, n = 1)
print(negative_persuasion_gram, output="truncated")
## delayed | 1
## NULL {1} |
##
## vexation | 2
## NULL {2} |
##
## strenuous | 1
## NULL {1} |
##
## unexpectedly | 1
## NULL {1} |
##
## shocked | 5
## NULL {5} |
##
## [[ ... results truncated ... ]]
print(negative_persuasion_gram, output="summary")
## An ngram object with 700 1-grams
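As a cross-check on the ngram summaries above (690 distinct positive vs. 980 distinct negative words in “Mansfield Park”; 530 vs. 700 in “Persuasion”), the share of positive words among all sentiment-bearing tokens can be computed directly; a minimal sketch using the mans and per tibbles:
# Positive share of all Bing sentiment words, per novel
bind_rows(mans, per) %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(book, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n) %>%
  mutate(positive_share = positive / (positive + negative))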
Alice_in_Wonderland <- gutenberg_download(28885)
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
Alice_in_Wonderland <- Alice_in_Wonderland %>% select(-gutenberg_id)
head(Alice_in_Wonderland, 100)
## # A tibble: 100 x 1
## text
## <chr>
## 1 "ALICE'S ADVENTURES IN WONDERLAND"
## 2 ""
## 3 "[Illustration: \"Alice\"]"
## 4 ""
## 5 "[Illustration:"
## 6 ""
## 7 " ALICE'S.ADVENTURES"
## 8 " IN.WONDERLAND"
## 9 " BY.LEWIS.CARROLL"
## 10 " ILLUSTRATED.BY"
## # ... with 90 more rows
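Before scoring, the downloaded text needs to be tokenized the same way as the Austen novels; a minimal sketch (dropping the bracketed [Illustration markup lines is an assumption about this particular Gutenberg edition):
# Tokenize Alice in Wonderland, skipping illustration markup lines
tidy_alice <- Alice_in_Wonderland %>%
  filter(!str_detect(text, "\\[Illustration")) %>%  # assumed cleanup for this edition
  unnest_tokens(word, text)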
positive_words_4_Alice <- tidy_alice %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  filter(sentiment == "positive") %>%
  count(word, sentiment, sort = TRUE)
positive_words_4_Alice %>%
  slice_max(n, n = 22) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  theme_classic() +
  labs(x = NULL, y = "Frequency",
       title = "Positive words: Alice in Wonderland")
negative_words_4_Alice <- tidy_alice %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  filter(sentiment == "negative") %>%
  count(word, sentiment, sort = TRUE)
negative_words_4_Alice %>%
  slice_max(n, n = 22) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  theme_classic() +
  labs(x = NULL, y = "Frequency",
       title = "Negative words: Alice in Wonderland")
From the analysis performed, and without having read any of the Jane Austen publications, one might gather that “Mansfield Park” has a much more negative tone than “Persuasion”. One might also gather that “Alice in Wonderland” is an uplifting story, dominated by words of contentment.
It’s evident that stop words can interfere with this kind of analysis. An argument can also be made that not all words flagged as negative are truly negative in context, nor do they carry equal weight; it’s all contextual within the literature. A more accurate model would analyze the surrounding words of the corpus, for example to catch negation, via natural language processing.
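One step in that direction, staying within tidytext, is to tokenize by bigrams and flag sentiment words preceded by a negation word; a minimal sketch, with the negator list being an illustrative assumption:
# Flag Bing sentiment words preceded by a negator
# (e.g. "not good" should not count as positive)
negators <- c("not", "no", "never", "without")  # assumed negation list
austen_books() %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(word1 %in% negators) %>%
  inner_join(get_sentiments("bing"), by = c("word2" = "word")) %>%
  count(word1, word2, sentiment, sort = TRUE)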