library(janeaustenr)
library(dplyr)
library(stringr)
# for unnest_tokens()
library(tidytext)
# for get_sentiments()
library(textdata)
# for pivot_wider()
library(tidyr)
# for ggplot()
library(ggplot2)
# for wordcloud() and comparison.cloud()
library(wordcloud)
# for acast(), to cast counts to a matrix
library(reshape2)
# for lemmatize_words()
library(textstem)
# for corpus()
library(quanteda)
# read the text of Austen's books, create line number and chapter variables,
# and convert the text to the tidy one-word-per-row format using unnest_tokens()
tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(linenumber = row_number(),
         chapter = cumsum(str_detect(text,
                                     regex("^chapter [\\divxlc]",
                                           ignore_case = TRUE)))) %>%
  ungroup() %>%
  unnest_tokens(word, text)
# get the words from the NRC lexicon that carry the "joy" sentiment
nrc_joy <- get_sentiments("nrc") %>%
  filter(sentiment == "joy")
# filter the text of "Emma", join with the joy words,
# and count the most common joy words in that novel
tidy_books %>%
  filter(book == "Emma") %>%
  inner_join(nrc_joy) %>%
  count(word, sort = TRUE)
## # A tibble: 301 × 2
## word n
## <chr> <int>
## 1 good 359
## 2 friend 166
## 3 hope 143
## 4 happy 125
## 5 love 117
## 6 deal 92
## 7 found 92
## 8 present 89
## 9 kind 82
## 10 happiness 76
## # ℹ 291 more rows
# count how many positive and negative words there are
# in sections of 80 lines of text within each book
jane_austen_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)
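# a quick worked illustration of the bucketing above (not part of the analysis):
# integer division with %/% maps consecutive line numbers onto 80-line section indices
c(0, 1, 79, 80, 81, 160) %/% 80
## [1] 0 0 0 1 1 2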
# plot the sentiment scores across the plot trajectory of each novel
ggplot(jane_austen_sentiment, aes(index, sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x")
# AFINN lexicon info:
#   URL: http://www2.imm.dtu.dk/pubdb/views/publication_details.php?id=6010
#   License: Open Database License (ODbL) v1.0
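# a small peek at the AFINN lexicon (illustrative only): unlike the binary
# bing/nrc categories, AFINN scores each word with an integer, roughly from
# -5 (most negative) to +5 (most positive)
get_sentiments("afinn") %>%
  slice_head(n = 5)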
# net sentiment per 80-line section of "Pride & Prejudice" using the AFINN values
pride_prejudice <- tidy_books %>%
  filter(book == "Pride & Prejudice")
afinn <- pride_prejudice %>%
  inner_join(get_sentiments("afinn")) %>%
  group_by(index = linenumber %/% 80) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(method = "AFINN")
bing_and_nrc <- bind_rows(
  pride_prejudice %>%
    inner_join(get_sentiments("bing")) %>%
    mutate(method = "Bing et al."),
  pride_prejudice %>%
    inner_join(get_sentiments("nrc") %>%
                 filter(sentiment %in% c("positive",
                                         "negative"))
    ) %>%
    mutate(method = "NRC")) %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment,
              values_from = n,
              values_fill = 0) %>%
  mutate(sentiment = positive - negative)
# bind them together and visualize them
bind_rows(afinn,
          bing_and_nrc) %>%
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")
# Summary: the three lexicons give sentiment values that differ in absolute
# terms, but their relative trajectories through the novel are similar;
# all three agree roughly on the overall trends in sentiment across the narrative.
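# one reason the absolute values differ across lexicons: they contain different
# numbers of positive and negative words, so the net scores sit on different
# scales; a quick illustrative count
get_sentiments("bing") %>%
  count(sentiment)
get_sentiments("nrc") %>%
  filter(sentiment %in% c("positive", "negative")) %>%
  count(sentiment)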
# count each word's contribution to each sentiment
bing_word_counts <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
# plot the top contributors to each sentiment with ggplot2
bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = "Contribution to sentiment",
       y = NULL)
# build a custom stop-word list with bind_rows(): "miss" is coded as negative
# in the Bing lexicon but is mostly used as a title for young women in Austen's novels
custom_stop_words <- bind_rows(tibble(word = c("miss"),
                                      lexicon = c("custom")),
                               stop_words)
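# one way the custom list could be applied (a sketch; the word cloud below
# still uses the default stop_words): drop "miss" along with the standard
# stop words before counting
tidy_books %>%
  anti_join(custom_stop_words, by = "word") %>%
  count(word, sort = TRUE)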
# word cloud of the most common words, after removing stop words
tidy_books %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))
# comparison cloud of the most common positive and negative words
tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray20", "gray80"),
                   max.words = 100)
# tokenizing into sentences
p_and_p_sentences <- tibble(text = prideprejudice) %>%
  unnest_tokens(sentence, text, token = "sentences")
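# an illustrative check of the sentence tokenizer: pull out a single sentence
p_and_p_sentences$sentence[2]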
# unnesting by chapter in each book, splitting the text on a regex
# that matches chapter headings
austen_chapters <- austen_books() %>%
  group_by(book) %>%
  unnest_tokens(chapter, text, token = "regex",
                pattern = "Chapter|CHAPTER [\\dIVXLC]") %>%
  ungroup()
#number of chapters in each novel (plus an “extra” row for each novel title)
austen_chapters %>%
  group_by(book) %>%
  summarise(chapters = n())
## # A tibble: 6 × 2
## book chapters
## <fct> <int>
## 1 Sense & Sensibility 51
## 2 Pride & Prejudice 62
## 3 Mansfield Park 49
## 4 Emma 56
## 5 Northanger Abbey 32
## 6 Persuasion 25
# with the chapter locations found via regex, go back to the tidy one-word-per-row
# data frame: get the negative words from the Bing lexicon and count the total
# words in each chapter so that chapter length can be normalized for
bingnegative <- get_sentiments("bing") %>%
  filter(sentiment == "negative")
wordcounts <- tidy_books %>%
  group_by(book, chapter) %>%
  summarize(words = n())
# find the most negative chapter in each of Jane Austen's novels:
# the chapter with the highest ratio of negative words to total words
tidy_books %>%
  semi_join(bingnegative) %>%
  group_by(book, chapter) %>%
  summarize(negativewords = n()) %>%
  left_join(wordcounts, by = c("book", "chapter")) %>%
  mutate(ratio = negativewords / words) %>%
  filter(chapter != 0) %>%
  slice_max(ratio, n = 1) %>%
  ungroup()
## # A tibble: 6 × 5
## book chapter negativewords words ratio
## <fct> <int> <int> <int> <dbl>
## 1 Sense & Sensibility 43 161 3405 0.0473
## 2 Pride & Prejudice 34 111 2104 0.0528
## 3 Mansfield Park 46 173 3685 0.0469
## 4 Emma 15 151 3340 0.0452
## 5 Northanger Abbey 21 149 2982 0.0500
## 6 Persuasion 4 62 1807 0.0343
# Loughran-McDonald sentiment lexicon
#   URL: https://sraf.nd.edu/textual-analysis/resources/
# load the lexicon, lemmatize its words, and drop the duplicates that lemmatization creates
lou <- get_sentiments("loughran") # 4150 words
lou$word <- lemmatize_words(lou$word)
lou <- lou %>% distinct() # 2456 words
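# lemmatize_words() maps inflected forms to a common lemma, which is why the
# lexicon shrinks after distinct(); a small illustration (exact mappings depend
# on textstem's lemma dictionary)
lemmatize_words(c("run", "ran", "running"))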
lou_positive_negative <- lou %>%
  filter(sentiment %in% c("positive", "negative"))
# combining with the crude data from the tm package
library(tm)
data(crude)
crude_df <- data.frame(text = sapply(crude, as.character), stringsAsFactors = FALSE)
# tokenize and join with the lexicon to get sentiment word counts
crude_sentiment <- crude_df %>%
  unnest_tokens(word, text) %>%
  inner_join(lou_positive_negative) %>%
  count(word, sentiment, sort = TRUE)
# top positive and negative words in the crude data
crude_sentiment %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = "Contribution to sentiment",
       y = NULL)
# comparison cloud of the most common positive and negative words in the crude data
crude_sentiment %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray20", "gray80"),
                   max.words = 100)