##Packages
library(tidyverse)
library(stringr)
library(tidytext)
library(janeaustenr)
library(textdata)
library(gutenbergr)
##Example from book:
# Tokenise Austen's novels into one word per row, remembering which line
# and chapter of its book each word came from.
tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(
    # Position of the line within its own book.
    linenumber = row_number(),
    # Running count of chapter headings seen so far (0 before the first one).
    chapter = cumsum(
      str_detect(text, regex("^chapter [\\divxlc]", ignore_case = TRUE))
    )
  ) %>%
  ungroup() %>%
  unnest_tokens(word, text)
Source: http://saifmohammad.com/WebDocs/Lexicons/NRC-Emotion-Lexicon.zip
Citation Info: Version: 0.92 Publicly Released: 10 July 2011 Created By: Dr. Saif M. Mohammad, Dr. Peter Turney Home Page: http://saifmohammad.com/WebPages/NRC-Emotion-Lexicon.htm
#Using NRC
# Keep only the NRC lexicon entries tagged with the "joy" sentiment.
nrc_joy <- filter(get_sentiments("nrc"), sentiment == "joy")

# Joy words in "Emma", most frequent first.
tidy_books %>%
  filter(book == "Emma") %>%
  inner_join(nrc_joy) %>%
  count(word, sort = TRUE)
## Joining with `by = join_by(word)`
## # A tibble: 301 × 2
## word n
## <chr> <int>
## 1 good 359
## 2 friend 166
## 3 hope 143
## 4 happy 125
## 5 love 117
## 6 deal 92
## 7 found 92
## 8 present 89
## 9 kind 82
## 10 happiness 76
## # ℹ 291 more rows
##Using Bing
# Net Bing sentiment (positive minus negative word counts) for every
# 80-line section of each novel.
jane_austen_sentiment <- tidy_books %>%
  # At least one word occurs more than once in the Bing lexicon, so a word
  # in the text can match several lexicon rows. Declare that relationship
  # explicitly instead of letting dplyr warn about it on every run.
  inner_join(get_sentiments("bing"), relationship = "many-to-many") %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining with `by = join_by(word)`
##Austen Sentiment plot
# One panel per novel, each with its own x-axis range.
jane_austen_sentiment %>%
  ggplot(aes(x = index, y = sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x")
##Comparison of the sentiment dictionaries
Entirety of P&P book text
# Word-level rows for "Pride & Prejudice" only, printed for inspection.
pride_prejudice <- filter(tidy_books, book == "Pride & Prejudice")
pride_prejudice
## # A tibble: 122,204 × 4
## book linenumber chapter word
## <fct> <int> <int> <chr>
## 1 Pride & Prejudice 1 0 pride
## 2 Pride & Prejudice 1 0 and
## 3 Pride & Prejudice 1 0 prejudice
## 4 Pride & Prejudice 3 0 by
## 5 Pride & Prejudice 3 0 jane
## 6 Pride & Prejudice 3 0 austen
## 7 Pride & Prejudice 7 1 chapter
## 8 Pride & Prejudice 7 1 1
## 9 Pride & Prejudice 10 1 it
## 10 Pride & Prejudice 10 1 is
## # ℹ 122,194 more rows
##AFINN, Bing and NRC sentiments
# AFINN attaches a numeric `value` to each word, so the score of an
# 80-line section is the sum of the values of its matched words.
afinn <- pride_prejudice %>%
  inner_join(get_sentiments("afinn")) %>%
  mutate(index = linenumber %/% 80) %>%
  group_by(index) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(method = "AFINN")
# Bing and NRC label words as positive/negative, so both are scored the
# same way: count the labels per 80-line section and take the difference.
bing_and_nrc <- bind_rows(
  pride_prejudice %>%
    inner_join(get_sentiments("bing")) %>%
    mutate(method = "Bing et al."),
  pride_prejudice %>%
    inner_join(
      # NRC also carries emotion categories; keep only the two polarities.
      get_sentiments("nrc") %>%
        filter(sentiment %in% c("positive", "negative"))
    ) %>%
    mutate(method = "NRC")
) %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)
Plot to compare
# Stack the three methods in separate panels; each lexicon gets its own
# y-axis range.
bind_rows(afinn, bing_and_nrc) %>%
  ggplot(aes(x = index, y = sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")
The three sentiment dictionaries show a broadly similar positive and
negative trajectory throughout Austen’s Pride and Prejudice. On closer
examination, however, NRC under-represents negative values, while AFINN
and Bing both capture negative sentiment better. This is likely due to
how NRC is structured compared to the others and how it defines negative
and positive words.
##Examining a separate text
Project Gutenberg book IDs: 1952, 60481, 3015.
I will be using Project Gutenberg and its R package (gutenbergr) to examine the sentiment of a different author, Charlotte Perkins Gilman.
# Load the AFINN lexicon and print it.
afinn <- get_sentiments("afinn")
afinn
## # A tibble: 2,477 × 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # ℹ 2,467 more rows
# Load the Bing lexicon and print it.
bing <- get_sentiments("bing")
bing
## # A tibble: 6,786 × 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # ℹ 6,776 more rows
# Load the NRC lexicon and print it.
nrc <- get_sentiments("nrc")
nrc
## # A tibble: 13,872 × 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # ℹ 13,862 more rows
# List the distinct sentiment categories present in the NRC lexicon.
nrc %>%
  pull(sentiment) %>%
  unique()
## [1] "trust" "fear" "negative" "sadness" "anger"
## [6] "surprise" "positive" "disgust" "joy" "anticipation"
Replace the Gutenberg IDs with the corresponding book titles so the texts are labelled correctly for analysis down the road.
# Download the three texts from Project Gutenberg by id.
C_P_G <- gutenberg_download(c(1952, 60481, 3015))
## Determining mirror for Project Gutenberg from https://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
# Parallel vectors: replace_values[i] is the id titled replacement_names[i].
replace_values <- c(3015, 60481, 1952)
replacement_names <- c("The Man-Made World", "In this our world", "The Yellow Wallpaper")
# Need to change the names for a cross analysis later.
# Look every id up in a single vectorised pass. The previous loop turned the
# column into character on its first assignment and then kept comparing it
# against numeric ids. coalesce() leaves any id without a title unchanged
# (as text), which is what the loop did as well.
C_P_G$gutenberg_id <- coalesce(
  replacement_names[match(C_P_G$gutenberg_id, replace_values)],
  as.character(C_P_G$gutenberg_id)
)
# One word per row across the downloaded texts, with the words listed in
# `stop_words` removed.
tidy_CPG <- C_P_G %>%
  unnest_tokens(output = word, input = text) %>%
  anti_join(stop_words)
## Joining with `by = join_by(word)`
Most common words?
The Man-Made World - 3015 In this our world - 60481 The Yellow Wallpaper - 1952
# Most frequent words across all texts. `TRUE` is spelled out because `T`
# is an ordinary variable that can be reassigned.
tidy_CPG %>%
  count(word, sort = TRUE)
## # A tibble: 8,256 × 2
## word n
## <chr> <int>
## 1 women 400
## 2 human 353
## 3 life 286
## 4 love 225
## 5 world 213
## 6 day 137
## 7 masculine 136
## 8 woman 136
## 9 male 134
## 10 social 124
## # ℹ 8,246 more rows
# Rebuild the tokenised table for the sentiment timelines: the id column
# becomes `book`, every line gets its position within its book, and stop
# words are kept this time (no anti_join).
tidy_CPG <- C_P_G %>%
  rename(book = gutenberg_id) %>%
  group_by(book) %>%
  mutate(linenumber = row_number()) %>%
  ungroup() %>%
  unnest_tokens(word, text)
# Net Bing sentiment (positive minus negative word counts) for every
# 30-line section of each book.
CPG_sentiment <- tidy_CPG %>%
  inner_join(bing) %>%
  count(book, index = linenumber %/% 30, sentiment) %>%
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = 0
  ) %>%
  mutate(sentiment = positive - negative)
## Joining with `by = join_by(word)`
# One panel per book, each with its own x-axis range.
CPG_sentiment %>%
  ggplot(aes(x = index, y = sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x")
We looked at anger; how about sadness? Let's visualize sadness like we did
for anger, but try using AFINN instead of NRC:
# Net AFINN score for every 10-line section of each book.
# `linenumber` is computed per book, so `book` must be part of the grouping.
# Without it, sections from different books that share an index are summed
# into a single bar.
afinn_sad <- tidy_CPG %>%
  inner_join(get_sentiments("afinn")) %>%
  group_by(book, index = linenumber %/% 10) %>%
  # Drop the leftover `book` grouping so later steps see a plain tibble.
  summarise(sentiment = sum(value), .groups = "drop") %>%
  mutate(method = "AFINN")
## Joining with `by = join_by(word)`
# One panel per book. `fill` maps the `book` column: a quoted name inside
# aes() is a constant string, not a column reference.
ggplot(afinn_sad, aes(index, sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 1, scales = "free")
Does this change with NRC and Bing?
# NRC entries tagged with the "sadness" sentiment.
sadness_nrc <- nrc %>%
  filter(sentiment == "sadness")

# Sadness words in the texts, most frequent first. `TRUE` is spelled out
# because `T` is an ordinary variable that can be reassigned.
tidy_CPG %>%
  inner_join(sadness_nrc) %>%
  count(word, sort = TRUE)
## Joining with `by = join_by(word)`
## # A tibble: 406 × 2
## word n
## <chr> <int>
## 1 mother 109
## 2 art 82
## 3 pain 59
## 4 death 40
## 5 sin 35
## 6 die 34
## 7 warfare 32
## 8 blue 27
## 9 shame 24
## 10 struggle 24
## # ℹ 396 more rows
# Net Bing sentiment (positive minus negative word counts) for every
# 10-line section of each book.
CPG_sent <- tidy_CPG %>%
  inner_join(get_sentiments("bing")) %>%
  count(book, index = linenumber %/% 10, sentiment) %>%
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = 0
  ) %>%
  mutate(sentiment = positive - negative)
## Joining with `by = join_by(word)`
# One panel per book, laid out over two rows, each with its own y-axis
# range. `FALSE` is spelled out because `F` is an ordinary variable that
# can be reassigned.
ggplot(CPG_sent, aes(index, sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, scales = "free_y", nrow = 2)
##Conclusion: The data above further highlights the difference in length
between the texts and how it can affect the visualization and understanding
of these text mining packages. Overall, the first iteration using Bing
provided the best look into the three books by Charlotte Perkins Gilman. I
wanted to use Gilman's works because The Yellow Wallpaper was one of the
darkest things I've read, and I wanted to see if these lexicons could pick
up on that, and also whether there were any significant trends in her other
work. While it isn't apparent in her other work, the analysis definitely
shows that her Yellow Wallpaper short story is much darker than the others.