This is a very simple sentiment analysis of “Anne of Green Gables” by the Canadian author L. M. Montgomery.

Let’s start by loading the necessary packages.

library(tidytext)
library(tidyverse)
library(stringr)
library(reshape2)
library(gutenbergr)
library(wordcloud)
library(reshape2)

Now we can download “Anne of Green Gables” from Gutenberg.

# Fetch Project Gutenberg work no. 45 as a tibble and show it.
agg <- gutenberg_download(gutenberg_id = 45)
agg
## # A tibble: 10,779 x 2
##    gutenberg_id text                                                    
##           <int> <chr>                                                   
##  1           45 ANNE OF GREEN GABLES                                    
##  2           45 ""                                                      
##  3           45 By Lucy Maud Montgomery                                 
##  4           45 ""                                                      
##  5           45 ""                                                      
##  6           45 ""                                                      
##  7           45 Table of Contents                                       
##  8           45 ""                                                      
##  9           45 "     CHAPTER I          Mrs. Rachel Lynde Is Surprised"
## 10           45 "     CHAPTER II         Matthew Cuthbert Is Surprised" 
## # ... with 10,769 more rows

Before we can continue, we need to turn the tibble into the tidy text format. This means that in the tibble there will be one single word per row and punctuation and stop words will be removed:

# Reshape the downloaded text into the tidy one-word-per-row format:
# unnest_tokens() splits the `text` column into a `word` column, and the
# anti-join then drops every word that is listed in `stop_words`.
tidy_agg <- agg %>%
  unnest_tokens(word, text) %>%
  # Name the join key explicitly instead of relying on a natural join.
  anti_join(stop_words, by = "word")
tidy_agg
## # A tibble: 34,186 x 2
##    gutenberg_id word      
##           <int> <chr>     
##  1           45 anne      
##  2           45 green     
##  3           45 gables    
##  4           45 lucy      
##  5           45 maud      
##  6           45 montgomery
##  7           45 table     
##  8           45 contents  
##  9           45 chapter   
## 10           45 rachel    
## # ... with 34,176 more rows

The NRC lexicon contains words that are associated with eight basic emotions (anger, fear, anticipation, trust, surprise, sadness, joy, and disgust) and two basic sentiments (positive and negative). Let’s explore the most frequent joy, surprise, fear and sadness words:

# joy words
# Subset of the NRC lexicon whose sentiment label is "joy".
nrc_joy <- filter(get_sentiments("nrc"), sentiment == "joy")

# Keep only the book's words found in that subset and tally them,
# most frequent first.
tidy_agg_joy <- inner_join(tidy_agg, nrc_joy) %>%
  count(word, sort = TRUE)
## Warning: `chr_along()` is deprecated as of rlang 0.2.0.
## This warning is displayed once per session.
tidy_agg_joy
## # A tibble: 310 x 2
##    word         n
##    <chr>    <int>
##  1 white      142
##  2 green      135
##  3 child       95
##  4 pretty      89
##  5 glad        74
##  6 love        70
##  7 found       68
##  8 lovely      64
##  9 hope        50
## 10 splendid    46
## # ... with 300 more rows
# surprise words
# Subset of the NRC lexicon whose sentiment label is "surprise".
nrc_surprise <- filter(get_sentiments("nrc"), sentiment == "surprise")

# Keep only the book's words found in that subset and tally them,
# most frequent first.
tidy_agg_surprise <- inner_join(tidy_agg, nrc_surprise) %>%
  count(word, sort = TRUE)
tidy_agg_surprise
## # A tibble: 186 x 2
##    word          n
##    <chr>     <int>
##  1 lovely       64
##  2 guess        50
##  3 hope         50
##  4 splendid     46
##  5 sweet        38
##  6 picnic       33
##  7 expect       29
##  8 wild         27
##  9 feeling      26
## 10 wonderful    25
## # ... with 176 more rows
# fear words
# Subset of the NRC lexicon whose sentiment label is "fear".
nrc_fear <- filter(get_sentiments("nrc"), sentiment == "fear")
# Keep only the book's words found in that subset and tally them,
# most frequent first.
tidy_agg_fear <- inner_join(tidy_agg, nrc_fear) %>%
  count(word, sort = TRUE)
tidy_agg_fear
## # A tibble: 354 x 2
##    word         n
##    <chr>    <int>
##  1 bad         41
##  2 afraid      39
##  3 asylum      37
##  4 dreadful    27
##  5 wicked      27
##  6 feeling     26
##  7 terrible    25
##  8 worse       24
##  9 haunted     23
## 10 orphan      23
## # ... with 344 more rows
# sadness words
# Subset of the NRC lexicon whose sentiment label is "sadness".
nrc_sadness <- filter(get_sentiments("nrc"), sentiment == "sadness")
# Keep only the book's words found in that subset and tally them,
# most frequent first.
tidy_agg_sadness <- inner_join(tidy_agg, nrc_sadness) %>%
  count(word, sort = TRUE)
tidy_agg_sadness
## # A tibble: 326 x 2
##    word         n
##    <chr>    <int>
##  1 lovely      64
##  2 dark        46
##  3 bad         41
##  4 blue        40
##  5 cry         39
##  6 mother      34
##  7 black       33
##  8 mistake     28
##  9 dreadful    27
## 10 feeling     26
## # ... with 316 more rows

Let’s now have a look at the most common positive and negative words and use ggplot2 to visualize how much each of those words contributes to the sentiment:

# view words
# Attach the Bing sentiment label to every word of the book that has one,
# then count each word/sentiment pair, most frequent first.
bing_word_counts <- inner_join(tidy_agg, get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
bing_word_counts
## # A tibble: 1,427 x 3
##    word      sentiment     n
##    <chr>     <chr>     <int>
##  1 miss      negative    148
##  2 pretty    positive     89
##  3 glad      positive     74
##  4 love      positive     70
##  5 lovely    positive     64
##  6 nice      positive     64
##  7 hard      negative     63
##  8 dark      negative     46
##  9 splendid  positive     46
## 10 beautiful positive     44
## # ... with 1,417 more rows
# visualize
# Horizontal bar chart of the highest-count words for each sentiment,
# one facet per sentiment.
bing_word_counts %>%
  group_by(sentiment) %>%
  # Rank explicitly by the count column `n` rather than letting top_n()
  # pick whichever column happens to be last (ties are kept).
  top_n(10, n) %>%
  ungroup() %>%
  # Order the bars by count instead of alphabetically.
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment", x = NULL) +
  coord_flip()

The word “miss” can have multiple meanings and is not necessarily a negative word, so it could be added to a list of custom stop words, but at this time we will skip it.

We can also make a wordcloud of the most frequent words limiting the number of words to 100:

# Word cloud of the most frequent words, capped at 100 words.
# tidy_agg had its stop words removed when it was created, so it is counted
# directly; a second anti_join(stop_words) would be a no-op.
tidy_agg %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))

And another wordcloud comparing the most frequent positive and negative words, this time limiting the number of words to 50:

# Comparison cloud of Bing-labelled words, capped at 50 words.
inner_join(tidy_agg, get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  # Cast to a word-by-sentiment array of counts, 0 where a pair is absent.
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(
    colors = c("darkorchid4", "darkseagreen4"),
    max.words = 50
  )

We can also see how the sentiment evolves across the whole book and compare the results obtained with three different dictionaries (afinn, bing and nrc). For this we need to divide the tibble tidy_agg into chunks, each consisting of a number of lines, and we will then visualize the net sentiment score (positive vs negative) of each chunk. However, as tidy_agg does not contain line numbers, we’ll use index numbers instead, effectively putting a number of words into each chunk. Let’s start with 80 words per chunk and see what we get.

# sentiment evolution over 80 words per chunk
# Give every row a running index so that words can be binned into chunks.
tidy_agg <- tibble::rowid_to_column(tidy_agg, var = "index")

# AFINN: sum the numeric word scores inside each chunk (index %/% 80).
afinn80 <- inner_join(tidy_agg, get_sentiments("afinn")) %>%
  group_by(index = index %/% 80) %>%
  summarise(sentiment = sum(score)) %>%
  mutate(method = "AFINN")

# Bing and NRC: count positive and negative words per chunk and take the
# difference. Only NRC's "positive"/"negative" labels are used.
bing_and_nrc <- bind_rows(
  inner_join(tidy_agg, get_sentiments("bing")) %>%
    mutate(method = "Bing et al."),
  inner_join(
    tidy_agg,
    filter(get_sentiments("nrc"), sentiment %in% c("positive", "negative"))
  ) %>%
    mutate(method = "NRC")
) %>%
  count(method, index = index %/% 80, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)

# One facet per lexicon, stacked vertically, each with its own y scale.
bind_rows(afinn80, bing_and_nrc) %>%
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y") +
  labs(
    title = "Evolution of sentiment in Anne of Green Gables",
    y = "Sentiment score",
    x = "Chunks"
  )

This looks very concentrated. Here we have one line per word and more than 34 thousand of them, so in order to have a better overview the intervals should be a lot bigger. Let’s try dividing the tibble into chunks of 200 words in each and see what happens!

# sentiment evolution over 200 words per chunk
# AFINN: sum the numeric word scores inside each chunk (index %/% 200).
afinn200 <- inner_join(tidy_agg, get_sentiments("afinn")) %>%
  group_by(index = index %/% 200) %>%
  summarise(sentiment = sum(score)) %>%
  mutate(method = "AFINN")

# Bing and NRC: count positive and negative words per chunk and take the
# difference. Only NRC's "positive"/"negative" labels are used.
bing_and_nrc <- bind_rows(
  inner_join(tidy_agg, get_sentiments("bing")) %>%
    mutate(method = "Bing et al."),
  inner_join(
    tidy_agg,
    filter(get_sentiments("nrc"), sentiment %in% c("positive", "negative"))
  ) %>%
    mutate(method = "NRC")
) %>%
  count(method, index = index %/% 200, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)

# One facet per lexicon, stacked vertically, each with its own y scale.
bind_rows(afinn200, bing_and_nrc) %>%
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y") +
  labs(
    title = "Evolution of sentiment in Anne of Green Gables",
    y = "Sentiment score",
    x = "Chunks"
  )

Much better! We can already see that according to the NRC lexicon, “Anne of Green Gables” is indeed quite a happy book! The AFINN lexicon shows greater variation between positive and negative scores, while the Bing lexicon appears to have a greater overall number of words identified as negative. Whatever the differences, there are at least two things that are consistent for all three lexicons: the highs and lows appear practically in the same places and the book gets happier (or less sad) as the story develops. The AFINN lexicon shows three very high spikes where the sentiment scores 50 or higher. Let’s find out what happened in the book!

# List the chunks whose AFINN sentiment score is 50 or higher.
# afinn200 already holds one row per chunk, so it is filtered directly.
# Joining it to tidy_agg on "index" would pair chunk numbers with unrelated
# word indexes and contributes nothing to the selected columns.
afinn200 %>%
  select(index, sentiment) %>%
  filter(sentiment >= 50)
## # A tibble: 3 x 2
##   index sentiment
##   <dbl>     <int>
## 1    20        52
## 2    93        51
## 3   159        50

According to the AFINN lexicon the three happiest chunks are numbered 20, 93 and 159. Because the chunk number is the integer quotient `index %/% 200`, chunk k (for k ≥ 1) contains indexes 200k to 200k + 199, so the words of each chunk can be looked up as follows:

# display top and bottom 20 words from chunk 20 using the headTail() function from the psych package
library(psych)
# Chunk numbers in afinn200 come from `index %/% 200`, so chunk 20 holds the
# words with index 4000 to 4199. Selecting with the same expression keeps
# this lookup aligned with the chunk that was actually scored; a hand-computed
# range of 3801:4000 would cover chunk 19 plus a single word of chunk 20.
# NOTE(review): output rendered with the old range must be regenerated.
chunk20 <- tidy_agg %>%
  filter(index %/% 200 == 20) %>%
  select(word) %>%
  headTail(top = 20, bottom = 20)
chunk20
##           word
## 1        amuse
## 2       dinner
## 3         time
## 4         anne
## 5         flew
## 6         door
## 7       alight
## 8         eyes
## 9      glowing
## 10   threshold
## 11     stopped
## 12       short
## 13     wheeled
## 14         sat
## 15       table
## 16       light
## 17        glow
## 18 effectually
## 19     blotted
## 20     clapped
## 21        <NA>
## 22      breath
## 23 aggravating
## 24        talk
## 25       woman
## 26     matthew
## 27     hitched
## 28      sorrel
## 29       buggy
## 30         due
## 31        time
## 32     marilla
## 33        anne
## 34         set
## 35     matthew
## 36        yard
## 37        gate
## 38       drove
## 39      slowly
## 40       jerry
## 41       buote

This chunk has a sentiment score of 52 making it the most positive 200-word chunk in the whole book and it refers to a conversation between Anne and Marilla on the morning following Anne’s arrival at Green Gables:

Anne washed the dishes deftly enough, as Marilla who kept a sharp eye on the process, discerned. Later on she made her bed less successfully, for she had never learned the art of wrestling with a feather tick. But is was done somehow and smoothed down; and then Marilla, to get rid of her, told her she might go out-of-doors and amuse herself until dinner time. Anne flew to the door, face alight, eyes glowing. On the very threshold she stopped short, wheeled about, came back and sat down by the table, light and glow as effectually blotted out as if some one had clapped an extinguisher on her. “What’s the matter now?” demanded Marilla. “I don’t dare go out,” said Anne, in the tone of a martyr relinquishing all earthly joys. “If I can’t stay here there is no use in my loving Green Gables. And if I go out there and get acquainted with all those trees and flowers and the orchard and the brook I’ll not be able to help loving it. It’s hard enough now, so I won’t make it any harder. I want to go out so much—everything seems to be calling to me, ‘Anne, Anne, come out to us. Anne, Anne, we want a playmate’—but it’s better not. There is no use in loving things if you have to be torn from them, is there? And it’s so hard to keep from loving things, isn’t it? That was why I was so glad when I thought I was going to live here. I thought I’d have so many things to love and nothing to hinder me. But that brief dream is over. I am resigned to my fate now, so I don’t think I’ll go out for fear I’ll get unresigned again. What is the name of that geranium on the window-sill, please?” “That’s the apple-scented geranium.” “Oh, I don’t mean that sort of a name. I mean just a name you gave it yourself. Didn’t you give it a name? May I give it one then? May I call it—let me see—Bonny would do—may I call it Bonny while I’m here? Oh, do let me!” “Goodness, I don’t care. But where on earth is the sense of naming a geranium?” “Oh, I like things to have handles even if they are only geraniums. 
It makes them seem more like people. How do you know but that it hurts a geranium’s feelings just to be called a geranium and nothing else? You wouldn’t like to be called nothing but a woman all the time. Yes, I shall call it Bonny. I named that cherry-tree outside my bedroom window this morning. I called it Snow Queen because it was so white. Of course, it won’t always be in blossom, but one can imagine that it is, can’t one?” “I never in all my life saw or heard anything to equal her,” muttered Marilla, beating a retreat down to the cellar after potatoes. “She is kind of interesting as Matthew says. I can feel already that I’m wondering what on earth she’ll say next. She’ll be casting a spell over me, too. She’s cast it over Matthew. That look he gave me when he went out said everything he said or hinted last night over again. I wish he was like other men and would talk things out. A body could answer back then and argue him into reason. But what’s to be done with a man who just looks?” Anne had relapsed into reverie, with her chin in her hands and her eyes on the sky, when Marilla returned from her cellar pilgrimage. There Marilla left her until the early dinner was on the table. “I suppose I can have the mare and buggy this afternoon, Matthew?” said Marilla. Matthew nodded and looked wistfully at Anne. Marilla intercepted the look and said grimly: “I’m going to drive over to White Sands and settle this thing. I’ll take Anne with me and Mrs. Spencer will probably make arrangements to send her back to Nova Scotia at once. I’ll set your tea out for you and I’ll be home in time to milk the cows.” Still Matthew said nothing and Marilla had a sense of having wasted words and breath. There is nothing more aggravating than a man who won’t talk back—unless it is a woman who won’t. Matthew hitched the sorrel into the buggy in due time and Marilla and Anne set off. 
Matthew opened the yard gate for them and as they drove slowly through, he said, to nobody in particular as it seemed: “Little Jerry Buote from the Creek was here this morning, and I told him I guessed I’d hire him for the summer.” Marilla made no reply, but she hit the unlucky sorrel such a vicious clip with the whip that the fat mare, unused to such treatment, whizzed indignantly down the lane at an alarming pace. Marilla looked back once as the buggy bounced along and saw that aggravating Matthew leaning over the gate, looking wistfully after them.

# display top and bottom 20 words from chunk 93
# Chunk numbers in afinn200 come from `index %/% 200`, so chunk 93 holds the
# words with index 18600 to 18799. Selecting with the same expression keeps
# this lookup aligned with the chunk that was actually scored; a range of
# 18401:18600 would cover chunk 92 plus a single word of chunk 93.
# NOTE(review): output rendered with the old range must be regenerated.
chunk93 <- tidy_agg %>%
  filter(index %/% 200 == 93) %>%
  select(word) %>%
  headTail(top = 20, bottom = 20)
chunk93
##            word
## 1       sleeves
## 2    minister's
## 3          wife
## 4    allowances
## 5         board
## 6         lynde
## 7         manse
## 8         ready
## 9       marilla
## 10      lynde's
## 11      evening
## 12     actuated
## 13       motive
## 14         save
## 15       avowed
## 16    returning
## 17     quilting
## 18       frames
## 19     borrowed
## 20    preceding
## 21         <NA>
## 22        sound
## 23   questioned
## 24     doctrine
## 25       wife's
## 26       people
## 27  respectable
## 28        women
## 29 housekeepers
## 30        lynde
## 31        sound
## 32     doctrine
## 33 housekeeping
## 34        woman
## 35        ideal
## 36  combination
## 37   minister's
## 38       family
## 39     minister
## 40         wife
## 41     pleasant

This is the second most positive chunk:

“/…/ Jane Andrews said she thought puffed sleeves were too worldly for a minister’s wife, but I didn’t make any such uncharitable remark, Marilla, because I know what it is to long for puffed sleeves. Besides, she’s only been a minister’s wife for a little while, so one should make allowances, shouldn’t they? They are going to board with Mrs. Lynde until the manse is ready.” If Marilla, in going down to Mrs. Lynde’s that evening, was actuated by any motive save her avowed one of returning the quilting frames she had borrowed the preceding winter, it was an amiable weakness shared by most of the Avonlea people. Many a thing Mrs. Lynde had lent, sometimes never expecting to see it again, came home that night in charge of the borrowers thereof. A new minister, and moreover a minister with a wife, was a lawful object of curiosity in a quiet little country settlement where sensations were few and far between. Old Mr. Bentley, the minister whom Anne had found lacking in imagination, had been pastor of Avonlea for eighteen years. He was a widower when he came, and a widower he remained, despite the fact that gossip regularly married him to this, that, or the other one, every year of his sojourn. In the preceding February he had resigned his charge and departed amid the regrets of his people, most of whom had the affection born of long intercourse for their good old minister in spite of his shortcomings as an orator. Since then the Avonlea church had enjoyed a variety of religious dissipation in listening to the many and various candidates and “supplies” who came Sunday after Sunday to preach on trial. These stood or fell by the judgment of the fathers and mothers in Israel; but a certain small, red-haired girl who sat meekly in the corner of the old Cuthbert pew also had her opinions about them and discussed the same in full with Matthew, Marilla always declining from principle to criticize ministers in any shape or form. “I don’t think Mr. 
Smith would have done, Matthew” was Anne’s final summing up. “Mrs. Lynde says his delivery was so poor, but I think his worst fault was just like Mr. Bentley’s—he had no imagination. And Mr. Terry had too much; he let it run away with him just as I did mine in the matter of the Haunted Wood. Besides, Mrs. Lynde says his theology wasn’t sound. Mr. Gresham was a very good man and a very religious man, but he told too many funny stories and made the people laugh in church; he was undignified, and you must have some dignity about a minister, mustn’t you, Matthew? I thought Mr. Marshall was decidedly attractive; but Mrs. Lynde says he isn’t married, or even engaged, because she made special inquiries about him, and she says it would never do to have a young unmarried minister in Avonlea, because he might marry in the congregation and that would make trouble. Mrs. Lynde is a very farseeing woman, isn’t she, Matthew? I’m very glad they’ve called Mr. Allan. I liked him because his sermon was interesting and he prayed as if he meant it and not just as if he did it because he was in the habit of it. Mrs. Lynde says he isn’t perfect, but she says she supposes we couldn’t expect a perfect minister for seven hundred and fifty dollars a year, and anyhow his theology is sound because she questioned him thoroughly on all the points of doctrine. And she knows his wife’s people and they are most respectable and the women are all good housekeepers. Mrs. Lynde says that sound doctrine in the man and good housekeeping in the woman make an ideal combination for a minister’s family.” The new minister and his wife were a young, pleasant-faced couple, still on their honeymoon, and full of all good and beautiful enthusiasms for their chosen lifework.

# display top and bottom 20 words from chunk 159
# Chunk numbers in afinn200 come from `index %/% 200`, so chunk 159 holds the
# words with index 31800 to 31999. Selecting with the same expression keeps
# this lookup aligned with the chunk that was actually scored; a range of
# 31601:31800 would cover chunk 158 plus a single word of chunk 159.
# NOTE(review): output rendered with the old range must be regenerated.
chunk159 <- tidy_agg %>%
  filter(index %/% 200 == 159) %>%
  select(word) %>%
  headTail(top = 20, bottom = 20)
chunk159
##             word
## 1       steadily
## 2        rivalry
## 3        gilbert
## 4        intense
## 5        avonlea
## 6         school
## 7          class
## 8     bitterness
## 9           anne
## 10        wished
## 11           win
## 12          sake
## 13     defeating
## 14       gilbert
## 15         proud
## 16 consciousness
## 17           won
## 18       victory
## 19        worthy
## 20        foeman
## 21          <NA>
## 22         medal
## 23         emily
## 24          clay
## 25           win
## 26         avery
## 27   scholarship
## 28          feel
## 29         badly
## 30      tomorrow
## 31         josie
## 32       laughed
## 33          anne
## 34      honestly
## 35          feel
## 36       violets
## 37        coming
## 38        purple
## 39        hollow
## 40         green
## 41        gables

And the third most positive chunk:

Anne worked hard and steadily. Her rivalry with Gilbert was as intense as it had ever been in Avonlea school, although it was not known in the class at large, but somehow the bitterness had gone out of it. Anne no longer wished to win for the sake of defeating Gilbert; rather, for the proud consciousness of a well-won victory over a worthy foeman. It would be worth while to win, but she no longer thought life would be insupportable if she did not. In spite of lessons the students found opportunities for pleasant times. Anne spent many of her spare hours at Beechwood and generally ate her Sunday dinners there and went to church with Miss Barry. The latter was, as she admitted, growing old, but her black eyes were not dim nor the vigor of her tongue in the least abated. But she never sharpened the latter on Anne, who continued to be a prime favorite with the critical old lady. “That Anne-girl improves all the time,” she said. “I get tired of other girls—there is such a provoking and eternal sameness about them. Anne has as many shades as a rainbow and every shade is the prettiest while it lasts. I don’t know that she is as amusing as she was when she was a child, but she makes me love her and I like people who make me love them. It saves me so much trouble in making myself love them.” Then, almost before anybody realized it, spring had come; out in Avonlea the Mayflowers were peeping pinkly out on the sere barrens where snow-wreaths lingered; and the “mist of green” was on the woods and in the valleys. But in Charlottetown harassed Queen’s students thought and talked only of examinations. “It doesn’t seem possible that the term is nearly over,” said Anne. “Why, last fall it seemed so long to look forward to—a whole winter of studies and classes. And here we are, with the exams looming up next week. 
Girls, sometimes I feel as if those exams meant everything, but when I look at the big buds swelling on those chestnut trees and the misty blue air at the end of the streets they don’t seem half so important.” Jane and Ruby and Josie, who had dropped in, did not take this view of it. To them the coming examinations were constantly very important indeed—far more important than chestnut buds or Maytime hazes. It was all very well for Anne, who was sure of passing at least, to have her moments of belittling them, but when your whole future depended on them—as the girls truly thought theirs did—you could not regard them philosophically. “I’ve lost seven pounds in the last two weeks,” sighed Jane. “It’s no use to say don’t worry. I will worry. Worrying helps you some—it seems as if you were doing something when you’re worrying. It would be dreadful if I failed to get my license after going to Queen’s all winter and spending so much money.” “I don’t care,” said Josie Pye. “If I don’t pass this year I’m coming back next. My father can afford to send me. Anne, Frank Stockley says that Professor Tremaine said Gilbert Blythe was sure to get the medal and that Emily Clay would likely win the Avery scholarship.” “That may make me feel badly tomorrow, Josie,” laughed Anne, “but just now I honestly feel that as long as I know the violets are coming out all purple down in the hollow below Green Gables and that little ferns are poking their heads up in Lovers’ Lane, it’s not a great deal of difference whether I win the Avery or not. I’ve done my best and I begin to understand what is meant by the ‘joy of the strife.’ Next to trying and winning, the best thing is trying and failing. Girls, don’t talk about exams! Look at that arch of pale green sky over those houses and picture to yourself what it must look like over the purply-dark beech-woods back of Avonlea.”

Finally, let’s find out how frequently each character is mentioned in the book. Some characters are often mentioned just with their first name or their full name, whereas others tend to be referred to with just their surname, and some characters share the same surname. It would not make sense to search for mentions of a character with their first name, surname and also full name, so I am doing this based on a vector that contains the main characters’ first names and surnames. Considering the structure of the tidy tibble, this should find all instances where a character is mentioned, regardless of whether it’s with their first name, surname or both. In the surname-only case we might not always be sure which character we are dealing with, but adding Miss, Mr or Mrs to the vector would not be very helpful, given that all words are separate anyway.

# First names and surnames of the main characters, in lower case to match the
# tokens in tidy_agg. Every entry must be a single word because tidy_agg holds
# one word per row; a two-word entry such as "minnie may" could never match,
# so Minnie May is looked up as "minnie".
main_char <- c(
  "anne", "shirley", "marilla", "matthew", "cuthbert", "diana", "minnie",
  "barry", "gilbert", "blythe", "rachel", "lynde", "muriel", "stacy",
  "allan", "josephine", "ruby", "gillis", "jane", "prissy", "andrews",
  "josie", "pye", "phillips", "charlie", "sloane", "hammond", "christine",
  "stuart"
)

# Bar chart of how often each of those names occurs in the book.
tidy_agg %>%
  filter(word %in% main_char) %>%
  # One row per name with its number of occurrences.
  count(word) %>%
  rename(character = word, mentions = n) %>%
  ggplot(aes(reorder(character, mentions), mentions,
             fill = factor(character))) +
  geom_col() +
  theme(legend.position = "none") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(x = "", y = "mentions", title = "Character mentions") +
  coord_flip()

# Word cloud of the character names, sized by how often each one occurs.
tidy_agg %>%
  filter(word %in% main_char) %>%
  count(word) %>%
  with(wordcloud(words = word, freq = n, max.words = 100))