This assignment reproduces the sentiment analysis from Chapter 2 of Text Mining with R by Julia Silge and David Robinson (https://www.tidytextmining.com/sentiment.html). The project is then extended with a sentiment analysis of the novel Les Miserables.
Loading libraries and sentiments
library(janeaustenr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(tidyr)
library(tidytext)
# Preview the AFINN lexicon: one row per word with a numeric `value` score.
get_sentiments("afinn")
## # A tibble: 2,477 × 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # … with 2,467 more rows
Tidying the text of Jane Austen’s books. Each row holds one word from one of Austen’s books, along with the line number and chapter in which it appears.
# Build a one-word-per-row table of Austen's novels. Within each book every
# line is numbered and assigned the running count of chapter headings seen so
# far (0 for lines before the first heading), then the text is tokenized.
tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(linenumber = row_number()) %>%
  mutate(
    chapter = cumsum(
      str_detect(text, regex("^chapter [\\divxlc]", ignore_case = TRUE))
    )
  ) %>%
  ungroup() %>%
  unnest_tokens(output = word, input = text)

tidy_books
## # A tibble: 725,055 × 4
## book linenumber chapter word
## <fct> <int> <int> <chr>
## 1 Sense & Sensibility 1 0 sense
## 2 Sense & Sensibility 1 0 and
## 3 Sense & Sensibility 1 0 sensibility
## 4 Sense & Sensibility 3 0 by
## 5 Sense & Sensibility 3 0 jane
## 6 Sense & Sensibility 3 0 austen
## 7 Sense & Sensibility 5 0 1811
## 8 Sense & Sensibility 10 1 chapter
## 9 Sense & Sensibility 10 1 1
## 10 Sense & Sensibility 13 1 the
## # … with 725,045 more rows
# unnest_tokens(word, text) splits the "text" column into one token per row and stores the tokens in a new column named "word"
Using the NRC sentiment lexicon, the words associated with the sentiment “joy” are matched against Austen’s book “Emma”, and each matching word is counted by its frequency throughout the book.
# NRC lexicon entries tagged with the "joy" sentiment.
nrc_joy <- filter(get_sentiments("nrc"), sentiment == "joy")

# Frequency of each joy word in "Emma", most frequent first.
tidy_books %>%
  filter(book == "Emma") %>%
  inner_join(nrc_joy) %>%
  count(word) %>%
  arrange(desc(n))
## Joining, by = "word"
## # A tibble: 301 × 2
## word n
## <chr> <int>
## 1 good 359
## 2 friend 166
## 3 hope 143
## 4 happy 125
## 5 love 117
## 6 deal 92
## 7 found 92
## 8 present 89
## 9 kind 82
## 10 happiness 76
## # … with 291 more rows
In analyzing how sentiment changes across Austen’s books, the Bing lexicon is used to analyze every 80 lines of each book in the data set.
# Net Bing sentiment (positive minus negative word count) for each 80-line
# section of every book.
jane_austen_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  # integer division assigns each line to its 80-line section
  mutate(index = linenumber %/% 80) %>%
  # tally the positive and negative words within each section
  count(book, index, sentiment) %>%
  # one row per section; a sentiment absent from a section counts as 0
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = 0
  ) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
jane_austen_sentiment
## # A tibble: 920 × 5
## book index negative positive sentiment
## <fct> <dbl> <int> <int> <int>
## 1 Sense & Sensibility 0 16 32 16
## 2 Sense & Sensibility 1 19 53 34
## 3 Sense & Sensibility 2 12 31 19
## 4 Sense & Sensibility 3 15 31 16
## 5 Sense & Sensibility 4 16 34 18
## 6 Sense & Sensibility 5 16 51 35
## 7 Sense & Sensibility 6 24 40 16
## 8 Sense & Sensibility 7 23 51 28
## 9 Sense & Sensibility 8 30 40 10
## 10 Sense & Sensibility 9 15 19 4
## # … with 910 more rows
Visualization of the changes in sentiment across each of Austen’s books.
library(ggplot2)
# One panel per book: net sentiment of each 80-line section by section index.
jane_austen_sentiment %>%
  ggplot(aes(x = index, y = sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(book), ncol = 2, scales = "free_x")
The following 3 chunks look into how the sentiment of the novel “Pride
and Prejudice” differs according to which lexicon is used.
# Keep only the tokens that belong to "Pride & Prejudice".
pride_prejudice <- filter(tidy_books, book == "Pride & Prejudice")

pride_prejudice
## # A tibble: 122,204 × 4
## book linenumber chapter word
## <fct> <int> <int> <chr>
## 1 Pride & Prejudice 1 0 pride
## 2 Pride & Prejudice 1 0 and
## 3 Pride & Prejudice 1 0 prejudice
## 4 Pride & Prejudice 3 0 by
## 5 Pride & Prejudice 3 0 jane
## 6 Pride & Prejudice 3 0 austen
## 7 Pride & Prejudice 7 1 chapter
## 8 Pride & Prejudice 7 1 1
## 9 Pride & Prejudice 10 1 it
## 10 Pride & Prejudice 10 1 is
## # … with 122,194 more rows
# AFINN: sum the per-word scores within each 80-line section.
afinn <- pride_prejudice %>%
  inner_join(get_sentiments("afinn")) %>%
  mutate(index = linenumber %/% 80) %>%
  group_by(index) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(method = "AFINN")
## Joining, by = "word"
afinn
## # A tibble: 163 × 3
## index sentiment method
## <dbl> <dbl> <chr>
## 1 0 29 AFINN
## 2 1 0 AFINN
## 3 2 20 AFINN
## 4 3 30 AFINN
## 5 4 62 AFINN
## 6 5 66 AFINN
## 7 6 60 AFINN
## 8 7 18 AFINN
## 9 8 84 AFINN
## 10 9 26 AFINN
## # … with 153 more rows
# Bing and NRC: label each matched word with its lexicon, then compute the
# net sentiment (positive minus negative count) per 80-line section.
bing_and_nrc <- bind_rows(
  pride_prejudice %>%
    inner_join(get_sentiments("bing")) %>%
    mutate(method = "Bing et al."),
  pride_prejudice %>%
    inner_join(
      # NRC also carries emotion categories; keep only the two polarities
      get_sentiments("nrc") %>%
        filter(sentiment %in% c("positive", "negative"))
    ) %>%
    mutate(method = "NRC")
) %>%
  mutate(index = linenumber %/% 80) %>%
  count(method, index, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
## Joining, by = "word"
bing_and_nrc
## # A tibble: 326 × 5
## method index negative positive sentiment
## <chr> <dbl> <int> <int> <int>
## 1 Bing et al. 0 7 21 14
## 2 Bing et al. 1 20 19 -1
## 3 Bing et al. 2 16 20 4
## 4 Bing et al. 3 19 31 12
## 5 Bing et al. 4 23 47 24
## 6 Bing et al. 5 15 49 34
## 7 Bing et al. 6 18 46 28
## 8 Bing et al. 7 23 33 10
## 9 Bing et al. 8 17 48 31
## 10 Bing et al. 9 22 40 18
## # … with 316 more rows
# Stack the three lexicon results and draw one panel per method.
list(afinn, bing_and_nrc) %>%
  bind_rows() %>%
  ggplot(aes(x = index, y = sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(method), ncol = 1, scales = "free_y")
Looking into the number of positive and negative words in both the Bing and NRC lexicons
# Number of positive and negative entries in the NRC lexicon.
get_sentiments("nrc") %>%
  filter(sentiment == "positive" | sentiment == "negative") %>%
  count(sentiment)
## # A tibble: 2 × 2
## sentiment n
## <chr> <int>
## 1 negative 3316
## 2 positive 2308
# Number of positive and negative entries in the Bing lexicon.
count(get_sentiments("bing"), sentiment)
## # A tibble: 2 × 2
## sentiment n
## <chr> <int>
## 1 negative 4781
## 2 positive 2005
Retrieving the most frequently used sentimental words.
# How often each Bing sentiment word occurs across all books, most frequent
# first. count() on ungrouped input returns an ungrouped table.
bing_word_counts <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment) %>%
  arrange(desc(n))
## Joining, by = "word"
bing_word_counts
## # A tibble: 2,585 × 3
## word sentiment n
## <chr> <chr> <int>
## 1 miss negative 1855
## 2 well positive 1523
## 3 good positive 1380
## 4 great positive 981
## 5 like positive 725
## 6 better positive 639
## 7 enough positive 613
## 8 happy positive 534
## 9 love positive 495
## 10 pleasure positive 462
## # … with 2,575 more rows
Visualizing the top 10 most frequently used negative and positive words across all of Austen’s books.
# Plot the ten highest-count words for each Bing sentiment.
bing_word_counts %>%
  group_by(sentiment) %>%
  # slice_max() replaces the superseded top_n() and names the ranking column
  # explicitly instead of defaulting to the last column; ties at the cut-off
  # are kept, as before.
  slice_max(n, n = 10) %>%
  ungroup() %>%
  # order the bars by frequency rather than alphabetically
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  coord_flip()
## Selecting by n
Customizing stop words. The word “miss” is used in the novels as a title for young, unmarried women, not in the sense of longing for someone or something.
# Prepend a custom entry for "miss" to the standard stop-word list.
custom_stop_words <- bind_rows(
  tibble(word = "miss", lexicon = "custom"),
  stop_words
)

custom_stop_words
## # A tibble: 1,150 × 2
## word lexicon
## <chr> <chr>
## 1 miss custom
## 2 a SMART
## 3 a's SMART
## 4 able SMART
## 5 about SMART
## 6 above SMART
## 7 according SMART
## 8 accordingly SMART
## 9 across SMART
## 10 actually SMART
## # … with 1,140 more rows
Creating a word cloud.
library(wordcloud)
## Loading required package: RColorBrewer
# Word cloud of the 100 most frequent non-stop-words across all books.
tidy_books %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(words = word, freq = n, max.words = 100))
## Joining, by = "word"
Comparing the most frequent positive and negative words via word
cloud.
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
# Reshape the per-word sentiment counts into a word-by-sentiment matrix
# (0 where a word lacks a sentiment) and draw a comparison cloud from it.
tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(formula = word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(
    colors = c("gray20", "gray80"),
    max.words = 100
  )
## Joining, by = "word"
Working with sentences…
# Tokenize "Pride and Prejudice" into sentences instead of words.
p_and_p_sentences <- unnest_tokens(
  tibble(text = prideprejudice),
  output = sentence,
  input = text,
  token = "sentences"
)

p_and_p_sentences$sentence[2]
## [1] "by jane austen"
…and chapters
# Split each book into chapter-sized pieces by tokenizing on the chapter
# headings.
austen_chapters <- austen_books() %>%
  group_by(book) %>%
  unnest_tokens(
    chapter, text,
    token = "regex",
    # The alternatives are grouped so that both spellings must be followed by
    # a space and a numeral. Ungrouped, `|` binds looser than concatenation
    # and any bare "Chapter" would split the text.
    pattern = "(?:Chapter|CHAPTER) [\\dIVXLC]"
  ) %>%
  ungroup()

# Number of pieces per book.
austen_chapters %>%
  count(book, name = "chapters")
## # A tibble: 6 × 2
## book chapters
## <fct> <int>
## 1 Sense & Sensibility 51
## 2 Pride & Prejudice 62
## 3 Mansfield Park 49
## 4 Emma 56
## 5 Northanger Abbey 32
## 6 Persuasion 25
Finding, for each book, the chapter with the highest proportion of negative sentiment words.
# Negative entries of the Bing lexicon.
bingnegative <- filter(get_sentiments("bing"), sentiment == "negative")

# Total number of words in every chapter of every book.
wordcounts <- tidy_books %>%
  group_by(book, chapter) %>%
  summarise(words = n())
## `summarise()` has grouped output by 'book'. You can override using the
## `.groups` argument.
# For each book, find the chapter whose share of negative words is highest.
tidy_books %>%
  semi_join(bingnegative, by = "word") %>%
  group_by(book, chapter) %>%
  # stay grouped by book so the ranking below is done per book
  summarize(negativewords = n(), .groups = "drop_last") %>%
  left_join(wordcounts, by = c("book", "chapter")) %>%
  mutate(ratio = negativewords / words) %>%
  # chapter 0 is the text before the first chapter heading
  filter(chapter != 0) %>%
  # slice_max() replaces the superseded top_n() and ranks by `ratio`
  # explicitly rather than by whichever column happens to be last
  slice_max(ratio, n = 1) %>%
  ungroup()
## Joining, by = "word"
## `summarise()` has grouped output by 'book'. You can override using the
## `.groups` argument.
## Selecting by ratio
## # A tibble: 6 × 5
## book chapter negativewords words ratio
## <fct> <int> <int> <int> <dbl>
## 1 Sense & Sensibility 43 161 3405 0.0473
## 2 Pride & Prejudice 34 111 2104 0.0528
## 3 Mansfield Park 46 173 3685 0.0469
## 4 Emma 15 151 3340 0.0452
## 5 Northanger Abbey 21 149 2982 0.0500
## 6 Persuasion 4 62 1807 0.0343
The rest of this assignment extends beyond the reproduced project to include a sentiment analysis of the novel Les Miserables
The gutenbergr package is used to retrieve the novel, which is then tidied into a workable dataset.
library(gutenbergr)
# Download Les Miserables (Project Gutenberg ID 135). The automatically
# selected mirror could not serve the book, which left `vhugo` empty, so a
# mirror is named explicitly.
vhugo <- gutenberg_download(
  135,
  mirror = "http://mirrors.xmission.com/gutenberg/"
)

# Stop here rather than letting every later step silently run on zero rows.
if (nrow(vhugo) == 0) {
  stop(
    "Les Miserables (Gutenberg ID 135) could not be downloaded; ",
    "try a different `mirror`.",
    call. = FALSE
  )
}
## Determining mirror for Project Gutenberg from https://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
## Warning: ! Could not download a book at http://aleph.gutenberg.org/1/3/135/135.zip.
## ℹ The book may have been archived.
## ℹ Alternatively, You may need to select a different mirror.
## → See https://www.gutenberg.org/MIRRORS.ALL for options.
## Warning: Unknown or uninitialised column: `text`.
# Tidy the novel: number every line, assign the running count of chapter
# headings, tokenize into words, remove stop words and drop the Gutenberg ID.
les_mis <- vhugo %>%
  mutate(linenumber = row_number()) %>%
  mutate(
    chapter = cumsum(
      str_detect(text, regex("^chapter [\\divxlc]", ignore_case = TRUE))
    )
  ) %>%
  unnest_tokens(output = word, input = text) %>%
  anti_join(stop_words) %>%
  select(-gutenberg_id)
## Joining, by = "word"
les_mis
## # A tibble: 0 × 3
## # … with 3 variables: linenumber <int>, chapter <int>, word <chr>
Retrieving the sentiment score for each chapter
# Sum the AFINN score of every matched word within each chapter.
les_mis_sentiment <- summarise(
  group_by(inner_join(les_mis, get_sentiments("afinn")), chapter),
  sentiment = sum(value)
)
## Joining, by = "word"
les_mis_sentiment
## # A tibble: 0 × 2
## # … with 2 variables: chapter <int>, sentiment <dbl>
Visualizing how the sentiment scores change across the 365 chapters within Les Miserables
# Bar chart of the total AFINN score per chapter, with a centred title.
les_mis_sentiment %>%
  ggplot(aes(x = chapter, y = sentiment)) +
  geom_col() +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(
    title = "Sentiment Analysis of Les Misérables",
    x = "Chapter ",
    y = "Sentiment Score"
  )
The 10 most positive chapters based on their sentiment scores
# Ten chapters with the highest total sentiment score.
head(arrange(les_mis_sentiment, desc(sentiment)), 10)
## # A tibble: 0 × 2
## # … with 2 variables: chapter <int>, sentiment <dbl>
Analyzing the most positive words of the most positive chapter
# Ten highest-scoring AFINN words in chapter 351, the chapter the write-up
# identifies as the most positive.
lesmis_pos_sentiment <- les_mis %>%
  filter(chapter == 351) %>%
  inner_join(get_sentiments("afinn"), by = "word") %>%
  # Keep one row per word: a word repeated in the chapter would otherwise
  # fill several of the ten slots and have its bars stacked in the plot.
  distinct(word, .keep_all = TRUE) %>%
  arrange(desc(value)) %>%
  head(10)
## Joining, by = "word"
lesmis_pos_sentiment
## # A tibble: 0 × 4
## # … with 4 variables: linenumber <int>, chapter <int>, word <chr>, value <dbl>
Visualizing
# Horizontal bar chart of the selected positive words and their AFINN scores.
lesmis_pos_sentiment_viz <-
  ggplot(lesmis_pos_sentiment, aes(x = word, y = value)) +
  geom_col(fill = "#04D6D9") +
  coord_flip() +
  labs(
    title = "Analysis of the 10 most Positive Words from the most Positive Chapter in Les Miserables",
    x = "Positive Words ",
    y = "Sentiment Score"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))
The 10 most negative chapters based on their sentiment scores
# Ten chapters with the lowest total sentiment score (the last ten rows of
# the descending sort, so the most negative chapter is printed last).
les_mis_sentiment %>%
  arrange(desc(sentiment)) %>%
  slice_tail(n = 10)
## # A tibble: 0 × 2
## # … with 2 variables: chapter <int>, sentiment <dbl>
Analyzing the most negative words of the most negative chapter
# Ten lowest-scoring AFINN words in chapter 220, the chapter the write-up
# identifies as the most negative.
lesmis_neg_sentiment <- les_mis %>%
  filter(chapter == 220) %>%
  inner_join(get_sentiments("afinn"), by = "word") %>%
  # Keep one row per word: a word repeated in the chapter would otherwise
  # fill several of the ten slots and have its bars stacked in the plot.
  distinct(word, .keep_all = TRUE) %>%
  arrange(desc(value)) %>%
  tail(10)
## Joining, by = "word"
lesmis_neg_sentiment
## # A tibble: 0 × 4
## # … with 4 variables: linenumber <int>, chapter <int>, word <chr>, value <dbl>
Visualization
# Horizontal bar chart of the selected negative words; the score axis is
# reversed so the bars extend in the same direction as in the positive chart.
lesmis_neg_sentiment_viz <-
  ggplot(lesmis_neg_sentiment, aes(x = word, y = value)) +
  geom_col(fill = "#D92104") +
  scale_y_reverse() +
  coord_flip() +
  labs(
    title = "Analysis of the 10 most Negative Words in the Most Negative Chapter in Les Miserables",
    x = "Negative Words ",
    y = "Sentiment Score"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

# Render both word charts.
lesmis_pos_sentiment_viz
lesmis_neg_sentiment_viz
The analysis of Les Miserables identifies the ten most positive and the ten most negative chapters of the novel. The 10 most positive words in the most positive chapter (351) suggest a theme of success and happiness, while the 10 most negative words in the most negative chapter (220) suggest themes of oppression, suffering, and injustice.