library(janeaustenr)
library(tidytext)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)
library(tidyr)
library(ggplot2)
library(wordcloud)
## Loading required package: RColorBrewer
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(ngram)
library(corpus)
library(lexicon)
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(formattable)
library(knitr)
library(gutenbergr)

## Reproducing code from Ch. 2

a <- get_sentiments("afinn")
head(a, 5)
## # A tibble: 5 x 2
##   word      value
##   <chr>     <dbl>
## 1 abandon      -2
## 2 abandoned    -2
## 3 abandons     -2
## 4 abducted     -2
## 5 abduction    -2
b <- get_sentiments("bing")
head(b,5)
## # A tibble: 5 x 2
##   word       sentiment
##   <chr>      <chr>    
## 1 2-faces    negative 
## 2 abnormal   negative 
## 3 abolish    negative 
## 4 abominable negative 
## 5 abominably negative
c <- get_sentiments("nrc")
head(c,5)
## # A tibble: 5 x 2
##   word      sentiment
##   <chr>     <chr>    
## 1 abacus    trust    
## 2 abandon   fear     
## 3 abandon   negative 
## 4 abandon   sadness  
## 5 abandoned anger
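
A quick, optional check (not part of the original chapter): filtering each lexicon for the same word shows how differently they encode sentiment. The word “abandon” is used here only because it appears in the AFINN and NRC previews above.

lookup_word <- "abandon"
get_sentiments("afinn") %>% filter(word == lookup_word)  # numeric score
get_sentiments("bing") %>% filter(word == lookup_word)   # positive/negative label
get_sentiments("nrc") %>% filter(word == lookup_word)    # one row per emotion category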

## Sentiment Analysis with Inner Join

Tidying the novels into one word per row while tracking line numbers and chapter breaks (lines that begin with the word “chapter”)

tidy_books <- austen_books() %>% 
                             group_by(book) %>%
                             mutate(
                               linenumber = row_number(),
                               chapter = cumsum(str_detect(text,
                                                           regex("^chapter [\\divxlc]",
                                                                 ignore_case = TRUE)))) %>%
                             ungroup() %>%
                             unnest_tokens(word, text)

What are the most common joy words in “Emma”?

nrc_joy <- get_sentiments("nrc") %>%
  filter(sentiment == 'joy')

words_of_joy <- tidy_books %>% filter(book == "Emma") %>%
  inner_join(nrc_joy) %>%
  count(word, sort = TRUE)
## Joining, by = "word"
head(words_of_joy, 7)
## # A tibble: 7 x 2
##   word       n
##   <chr>  <int>
## 1 good     359
## 2 young    192
## 3 friend   166
## 4 hope     143
## 5 happy    125
## 6 love     117
## 7 deal      92
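
The Joining, by = "word" message appears because inner_join() guesses the key column; naming the key explicitly (an optional variant of the call above, with the same result) keeps the output quiet:

words_of_joy <- tidy_books %>%
  filter(book == "Emma") %>%
  inner_join(nrc_joy, by = "word") %>%  # explicit join key
  count(word, sort = TRUE)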

Comparing the amounts of negative and positive sentiment across each novel

jane_austen_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>% 
  mutate(sentiment = positive - negative)
## Joining, by = "word"

Graphing sentiment scores

ggplot(jane_austen_sentiment, aes(index, sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x")

Comparing all three sentiment lexicons on “Pride & Prejudice”

pride_prejudice <- tidy_books %>%
  filter(book == "Pride & Prejudice")

pride_prejudice
## # A tibble: 122,204 x 4
##    book              linenumber chapter word     
##    <fct>                  <int>   <int> <chr>    
##  1 Pride & Prejudice          1       0 pride    
##  2 Pride & Prejudice          1       0 and      
##  3 Pride & Prejudice          1       0 prejudice
##  4 Pride & Prejudice          3       0 by       
##  5 Pride & Prejudice          3       0 jane     
##  6 Pride & Prejudice          3       0 austen   
##  7 Pride & Prejudice          7       1 chapter  
##  8 Pride & Prejudice          7       1 1        
##  9 Pride & Prejudice         10       1 it       
## 10 Pride & Prejudice         10       1 is       
## # ... with 122,194 more rows

Defining larger sections of text that span multiple lines, then computing the net sentiment of each section with every lexicon
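
A small illustration of the index used below: linenumber %/% 80 is integer division, so every 80 consecutive lines share one index value (the example numbers are arbitrary).

tibble(linenumber = c(1, 79, 80, 159, 160)) %>%
  mutate(index = linenumber %/% 80)
# lines 1-79 fall in index 0, lines 80-159 in index 1, line 160 starts index 2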

afinn <- pride_prejudice %>% 
  inner_join(get_sentiments("afinn")) %>% 
  group_by(index = linenumber %/% 80) %>% 
  summarise(sentiment = sum(value)) %>% 
  mutate(method = "AFINN")
## Joining, by = "word"
## `summarise()` ungrouping output (override with `.groups` argument)
bing_and_nrc <- bind_rows(
  pride_prejudice %>% 
    inner_join(get_sentiments("bing")) %>%
    mutate(method = "Bing et al."),
  pride_prejudice %>% 
    inner_join(get_sentiments("nrc") %>% 
                 filter(sentiment %in% c("positive", 
                                         "negative"))
    ) %>%
    mutate(method = "NRC")) %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment,
              values_from = n,
              values_fill = 0) %>% 
  mutate(sentiment = positive - negative)
## Joining, by = "word"
## Joining, by = "word"

Binding and visualizing the sentiment differences

bind_rows(afinn, 
          bing_and_nrc) %>%
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")

Number of positive and negative words in each lexicon

get_sentiments("nrc") %>% 
  filter(sentiment %in% c("positive", "negative")) %>% 
  count(sentiment)
## # A tibble: 2 x 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   3324
## 2 positive   2312
get_sentiments("bing") %>% 
  filter(sentiment %in% c("positive", "negative")) %>% 
  count(sentiment)
## # A tibble: 2 x 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   4781
## 2 positive   2005
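
Both lexicons contain more negative than positive entries, but not by the same margin. A quick sketch that quantifies the imbalance from the counts above:

bind_rows(
  get_sentiments("nrc") %>%
    filter(sentiment %in% c("positive", "negative")) %>%
    count(sentiment) %>%
    mutate(lexicon = "NRC"),
  get_sentiments("bing") %>%
    count(sentiment) %>%
    mutate(lexicon = "Bing")
) %>%
  pivot_wider(names_from = sentiment, values_from = n) %>%
  mutate(neg_to_pos = negative / positive)  # roughly 1.4 for NRC, roughly 2.4 for Bing

This difference in ratios helps explain why the absolute sentiment values from the two lexicons in the earlier plot are not directly comparable.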

Most common positive and negative words

bing_word_counts <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
## Joining, by = "word"
head(bing_word_counts, 7)
## # A tibble: 7 x 3
##   word   sentiment     n
##   <chr>  <chr>     <int>
## 1 miss   negative   1855
## 2 well   positive   1523
## 3 good   positive   1380
## 4 great  positive    981
## 5 like   positive    725
## 6 better positive    639
## 7 enough positive    613

Visualizing positive and negative word counts

bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>% 
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = "Contribution to sentiment",
       y = NULL)

Designating “miss” as a “stop word”

custom_stop_words <- bind_rows(tibble(word = c("miss"),  
                                      lexicon = c("SMART")), 
                               stop_words)

head(custom_stop_words, 7)
## # A tibble: 7 x 2
##   word      lexicon
##   <chr>     <chr>  
## 1 miss      SMART  
## 2 a         SMART  
## 3 a's       SMART  
## 4 able      SMART  
## 5 about     SMART  
## 6 above     SMART  
## 7 according SMART
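
custom_stop_words is defined above but not applied; a minimal sketch of how it could be used, removing “miss” and the other stop words before recounting the Bing sentiment words:

tidy_books %>%
  anti_join(custom_stop_words, by = "word") %>%   # drops "miss" along with the standard stop words
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment, sort = TRUE)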

Creating a word cloud of the most common words (with stop words removed)

tidy_books %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))
## Joining, by = "word"

Arranging the word cloud to compare positive and negative words

tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("coral4", "goldenrod2"))
## Joining, by = "word"

Tokenizing text into chapters with a regex

austen_chapters <- austen_books() %>%
  group_by(book) %>%
  unnest_tokens(chapter, text, token = "regex", 
                pattern = "Chapter|CHAPTER [\\dIVXLC]") %>%
  ungroup()

austen_chapters %>% 
  group_by(book) %>% 
  summarise(chapters = n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 6 x 2
##   book                chapters
##   <fct>                  <int>
## 1 Sense & Sensibility       51
## 2 Pride & Prejudice         62
## 3 Mansfield Park            49
## 4 Emma                      56
## 5 Northanger Abbey          32
## 6 Persuasion                25
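
Chapters are just one possible unit; the same unnest_tokens() call can split text into sentences instead. A brief sketch (austen_sentences is an illustrative name):

austen_sentences <- austen_books() %>%
  group_by(book) %>%
  unnest_tokens(sentence, text, token = "sentences") %>%
  ungroup()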

Which chapter of each novel has the highest proportion of negative words?

bingnegative <- get_sentiments("bing") %>% 
  filter(sentiment == "negative")

wordcounts <- tidy_books %>%
  group_by(book, chapter) %>%
  summarize(words = n())
## `summarise()` regrouping output by 'book' (override with `.groups` argument)
tidy_books %>%
  semi_join(bingnegative) %>%
  group_by(book, chapter) %>%
  summarize(negativewords = n()) %>%
  left_join(wordcounts, by = c("book", "chapter")) %>%
  mutate(ratio = negativewords/words) %>%
  filter(chapter != 0) %>%
  slice_max(ratio, n = 1) %>% 
  ungroup()
## Joining, by = "word"
## `summarise()` regrouping output by 'book' (override with `.groups` argument)
## # A tibble: 6 x 5
##   book                chapter negativewords words  ratio
##   <fct>                 <int>         <int> <int>  <dbl>
## 1 Sense & Sensibility      43           161  3405 0.0473
## 2 Pride & Prejudice        34           111  2104 0.0528
## 3 Mansfield Park           46           173  3685 0.0469
## 4 Emma                     15           151  3340 0.0452
## 5 Northanger Abbey         21           149  2982 0.0500
## 6 Persuasion                4            62  1807 0.0343

New sentiment analyses

Question: Comparing the quantities of positive and negative words, which Jane Austen novel seems more likely to be uplifting: “Mansfield Park” or “Persuasion”?

mans <- tidy_books %>%
  filter(book == "Mansfield Park")

per <- tidy_books %>%
  filter(book == "Persuasion")
bing_word_counts <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
## Joining, by = "word"
positive_words <- bing_word_counts %>% 
  filter(sentiment == "positive")

head(positive_words, 3)
## # A tibble: 3 x 3
##   word  sentiment     n
##   <chr> <chr>     <int>
## 1 well  positive   1523
## 2 good  positive   1380
## 3 great positive    981
negative_words <- bing_word_counts %>% 
  filter(sentiment == "negative")

head(negative_words, 3)
## # A tibble: 3 x 3
##   word  sentiment     n
##   <chr> <chr>     <int>
## 1 miss  negative   1855
## 2 poor  negative    424
## 3 doubt negative    281

Cross-referencing the positive and negative words with “Mansfield Park” and “Persuasion”

positive_mans <- positive_words %>% inner_join(mans, by = "word")

positive_mans_gram <- ngram(positive_mans$word, n = 1)

print(positive_mans_gram, output="truncated")
## admiring | 2 
## NULL {2} | 
## 
## hug | 1 
## NULL {1} | 
## 
## grandeur | 5 
## NULL {5} | 
## 
## resolute | 1 
## NULL {1} | 
## 
## humour | 25 
## NULL {25} | 
## 
## [[ ... results truncated ... ]]
print(positive_mans_gram, output="summary")
## An ngram object with 690 1-grams
negative_mans <- negative_words %>% inner_join(mans, by = "word")
negative_mans_gram <- ngram(negative_mans$word, n = 1)

print(negative_mans_gram, output="truncated")
## delayed | 6 
## NULL {6} | 
## 
## vexation | 14 
## NULL {14} | 
## 
## bewildered | 7 
## NULL {7} | 
## 
## unexpectedly | 4 
## NULL {4} | 
## 
## shocked | 8 
## NULL {8} | 
## 
## [[ ... results truncated ... ]]
print(negative_mans_gram, output="summary")
## An ngram object with 980 1-grams
positive_persuasion <- positive_words %>% inner_join(per, by = "word")


positive_persuasion <- ngram(positive_persuasion$word, n = 1)

print(positive_persuasion, output="truncated")
## admiring | 3 
## NULL {3} | 
## 
## grandeur | 3 
## NULL {3} | 
## 
## resolute | 3 
## NULL {3} | 
## 
## humour | 10 
## NULL {10} | 
## 
## willingly | 2 
## NULL {2} | 
## 
## [[ ... results truncated ... ]]
print(positive_persuasion, output="summary")
## An ngram object with 530 1-grams
negative_persuasion <- negative_words %>% inner_join(per, by = "word")

negative_persuasion <- ngram(negative_persuasion$word, n = 1)

print(negative_persuasion, output="truncated")
## delayed | 1 
## NULL {1} | 
## 
## vexation | 2 
## NULL {2} | 
## 
## strenuous | 1 
## NULL {1} | 
## 
## unexpectedly | 1 
## NULL {1} | 
## 
## shocked | 5 
## NULL {5} | 
## 
## [[ ... results truncated ... ]]
print(negative_persuasion, output="summary")
## An ngram object with 700 1-grams
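
The ngram summaries above count distinct word types in each novel. As a sketch of a more direct answer to the question (not part of the original comparison), one could count every positive and negative token per book and take the ratio:

tidy_books %>%
  filter(book %in% c("Mansfield Park", "Persuasion")) %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(book, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n) %>%
  mutate(pos_to_neg = positive / negative)  # higher values suggest a more uplifting tone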

Alice in Wonderland: Positive and Negative Word frequency

Alice_in_Wonderland <- gutenberg_download(28885)
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
Alice_in_wonderland <- Alice_in_Wonderland %>% select(-gutenberg_id)
head(Alice_in_wonderland, 100)
## # A tibble: 100 x 1
##    text                              
##    <chr>                             
##  1 "ALICE'S ADVENTURES IN WONDERLAND"
##  2 ""                                
##  3 "[Illustration: \"Alice\"]"       
##  4 ""                                
##  5 "[Illustration:"                  
##  6 ""                                
##  7 "          ALICE'S.ADVENTURES"    
##  8 "          IN.WONDERLAND"         
##  9 "          BY.LEWIS.CARROLL"      
## 10 "          ILLUSTRATED.BY"        
## # ... with 90 more rows
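
Note that positive_words and negative_words used below are still the counts from the Jane Austen corpus. A minimal sketch of how the downloaded Alice in Wonderland text itself could be tokenized and scored with the Bing lexicon (tidy_alice and alice_bing_counts are illustrative names):

tidy_alice <- Alice_in_wonderland %>%
  mutate(linenumber = row_number()) %>%
  unnest_tokens(word, text)

alice_bing_counts <- tidy_alice %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment, sort = TRUE)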
positive_words_4_Alice <- positive_words %>% 
  filter(sentiment == "positive")

head(positive_words_4_Alice, 3)
## # A tibble: 3 x 3
##   word  sentiment     n
##   <chr> <chr>     <int>
## 1 well  positive   1523
## 2 good  positive   1380
## 3 great positive    981
positive_words_4_Alice %>%
  select(word, sentiment, n) %>%
  arrange() %>% top_n(22, n) %>%
  mutate(word = reorder(word,n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() + 
  xlab(NULL) +
  coord_flip() +
  theme_classic() +
  labs(x = "+  +", y = "Frequency", 
       title = "Positive words: Alice in Wonderland")

negative_words_4_Alice <- negative_words %>% 
  filter(sentiment == "negative")

head(negative_words_4_Alice, 3)
## # A tibble: 3 x 3
##   word  sentiment     n
##   <chr> <chr>     <int>
## 1 miss  negative   1855
## 2 poor  negative    424
## 3 doubt negative    281
negative_words_4_Alice %>%
  select(word, sentiment, n) %>%
  arrange() %>% top_n(22, n) %>%
  mutate(word = reorder(word,n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() + 
  xlab(NULL) +
  coord_flip() +
  theme_classic() +
  labs(x = "__ __ ", y = "Frequency", 
       title = "Negative words: Alice in Wonderland")

Conclusion

From the analysis performed, and without having read any of Jane Austen’s publications, one might gather that “Mansfield Park” has a much more negative tone than “Persuasion”. One might also gather that “Alice in Wonderland” is an uplifting story, conveying mostly feelings of contentment.

It’s evident that stop words can interfere with the analysis. However, an argument can be made that not all such words are necessarily negative, nor do they carry equal weight; their effect depends on the context within the literature. To build a more accurate model, analyzing the surrounding words of the corpus with natural language processing techniques would improve the accuracy of the analysis.
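
One concrete way to look at surrounding words with the tools already loaded here is to tokenize into bigrams and check which Bing sentiment words are preceded by a negation such as “not” (a rough sketch; austen_bigrams is an illustrative name):

austen_bigrams <- austen_books() %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(word1 == "not") %>%
  inner_join(get_sentiments("bing"), by = c("word2" = "word")) %>%
  count(word1, word2, sentiment, sort = TRUE)  # sentiment words whose meaning is flipped by "not"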

Sources

https://stackoverflow.com/questions/37291984/find-the-most-frequently-occuring-words-in-a-text-in-r

https://www.rdocumentation.org/packages/gutenbergr/versions/0.2.0/topics/gutenberg_download

(inspiration) https://medium.com/analytics-vidhya/different-ways-of-visualizing-twitter-sentiments-analysis-in-r-270d5d459603

https://frex1.github.io/twitter-sentiments/tweets

https://www.rdocumentation.org/packages/formattable/versions/0.2.1/topics/color_tile

(What worked best) https://towardsdatascience.com/twitter-sentiment-analysis-and-visualization-using-r-22e1f70f6967