Introduction

In this assignment, you should start by getting the primary example code from Chapter 2 working in an R Markdown document, and you should provide a citation to this base code. You're then asked to extend the code in two ways:

1. Work with a different corpus of your choosing, and
2. Incorporate at least one additional sentiment lexicon.

Part 1 reproduces the base code, which is adapted from Chapter 2 of "Text Mining with R" by Julia Silge and David Robinson (https://www.tidytextmining.com/sentiment.html). Part 2 extends the analysis to Oscar Wilde's "The Picture of Dorian Gray" (retrieved from Project Gutenberg with gutenbergr) and incorporates the syuzhet and Loughran lexicons.

Part 1

Loading Libraries and Sentiments
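
The library chunk was not echoed in the original document, so the list below is reconstructed from the functions used throughout this report (a minimal sketch, assuming these are the packages that were loaded):

library(janeaustenr) # austen_books()
library(gutenbergr)  # gutenberg_works(), gutenberg_download()
library(dplyr)       # mutate(), count(), bind_rows(), the joins
library(stringr)     # str_detect(), regex()
library(tidyr)       # spread()
library(tidytext)    # unnest_tokens(), get_sentiments(), stop_words
library(ggplot2)     # ggplot(), geom_col(), facet_wrap()
library(wordcloud)   # wordcloud(), comparison.cloud()
library(reshape2)    # acast()
library(syuzhet)     # get_sentiment()

The sentiment lexicons themselves (AFINN, Bing, NRC, and Loughran) are retrieved on demand below with tidytext::get_sentiments().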

Tokenization of Books

tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(
    linenumber = row_number(),
    chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]",
      ignore_case = TRUE
    )))
  ) %>%
  ungroup() %>%
  unnest_tokens(word, text)

Joining on Joy Sentiments

nrc_joy <- get_sentiments("nrc") %>%
  filter(sentiment == "joy")

tidy_books %>%
  filter(book == "Emma") %>%
  inner_join(nrc_joy) %>%
  count(word, sort = TRUE)
## Joining, by = "word"
## # A tibble: 303 x 2
##    word        n
##    <chr>   <int>
##  1 good      359
##  2 young     192
##  3 friend    166
##  4 hope      143
##  5 happy     125
##  6 love      117
##  7 deal       92
##  8 found      92
##  9 present    89
## 10 kind       82
## # ... with 293 more rows

Finding Sentiment Using the Bing Sentiment Lexicon

jane_austen_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"

Graphing Sentiments by Book

ggplot(jane_austen_sentiment, aes(index, sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x")

Comparing the three sentiment dictionaries

pride_prejudice <- tidy_books %>%
  filter(book == "Pride & Prejudice")

Finding Sentiments Based on Each Method

afinn <- pride_prejudice %>%
  inner_join(get_sentiments("afinn")) %>%
  group_by(index = linenumber %/% 80) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(method = "AFINN")
## Joining, by = "word"
## `summarise()` ungrouping output (override with `.groups` argument)
bing_and_nrc <- bind_rows(
  pride_prejudice %>%
    inner_join(get_sentiments("bing")) %>%
    mutate(method = "Bing et al."),
  pride_prejudice %>%
    inner_join(get_sentiments("nrc") %>%
      filter(sentiment %in% c(
        "positive",
        "negative"
      ))) %>%
    mutate(method = "NRC")
) %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
## Joining, by = "word"

Graphing the Sentiments from Each Method

bind_rows(
  afinn,
  bing_and_nrc
) %>%
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")

get_sentiments("nrc") %>%
  filter(sentiment %in% c(
    "positive",
    "negative"
  )) %>%
  count(sentiment)
## # A tibble: 2 x 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   3324
## 2 positive   2312
get_sentiments("bing") %>%
  count(sentiment)
## # A tibble: 2 x 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   4781
## 2 positive   2005
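
These counts matter because the two lexicons are not balanced the same way: Bing has a noticeably higher ratio of negative to positive words than NRC, which helps explain why the Bing trajectories sit lower than the NRC trajectories in the plot above. A small sketch (not part of the original Chapter 2 code) that makes the comparison explicit:

bind_rows(
  get_sentiments("nrc") %>%
    filter(sentiment %in% c("positive", "negative")) %>%
    mutate(lexicon = "NRC"),
  get_sentiments("bing") %>%
    mutate(lexicon = "Bing")
) %>%
  count(lexicon, sentiment) %>%
  spread(sentiment, n) %>%
  mutate(neg_to_pos = negative / positive)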

Most common positive and negative words

bing_word_counts <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
## Joining, by = "word"
bing_word_counts
## # A tibble: 2,585 x 3
##    word     sentiment     n
##    <chr>    <chr>     <int>
##  1 miss     negative   1855
##  2 well     positive   1523
##  3 good     positive   1380
##  4 great    positive    981
##  5 like     positive    725
##  6 better   positive    639
##  7 enough   positive    613
##  8 happy    positive    534
##  9 love     positive    495
## 10 pleasure positive    462
## # ... with 2,575 more rows
bing_word_counts %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(
    y = "Contribution to sentiment",
    x = NULL
  ) +
  coord_flip()
## Selecting by n

Adding to Stop Words

custom_stop_words <- bind_rows(
  tibble(
    word = c("miss"),
    lexicon = c("custom")
  ),
  stop_words
)

custom_stop_words
## # A tibble: 1,150 x 2
##    word        lexicon
##    <chr>       <chr>  
##  1 miss        custom 
##  2 a           SMART  
##  3 a's         SMART  
##  4 able        SMART  
##  5 about       SMART  
##  6 above       SMART  
##  7 according   SMART  
##  8 accordingly SMART  
##  9 across      SMART  
## 10 actually    SMART  
## # ... with 1,140 more rows
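
As in Chapter 2, the custom stop word list is only defined here. A brief illustration (not in the original code) of how it could be applied, for example to drop "miss" before recounting the Bing sentiment words:

tidy_books %>%
  anti_join(custom_stop_words, by = "word") %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment, sort = TRUE)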

Wordclouds

tidy_books %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))
## Joining, by = "word"

tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(
    colors = c("gray20", "gray80"),
    max.words = 100
  )
## Joining, by = "word"

Looking at units beyond just words

PandP_sentences <- tibble(text = prideprejudice) %>%
  unnest_tokens(sentence, text, token = "sentences")
austen_chapters <- austen_books() %>%
  group_by(book) %>%
  unnest_tokens(chapter, text,
    token = "regex",
    pattern = "Chapter|CHAPTER [\\dIVXLC]"
  ) %>%
  ungroup()

austen_chapters %>%
  group_by(book) %>%
  summarise(chapters = n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 6 x 2
##   book                chapters
##   <fct>                  <int>
## 1 Sense & Sensibility       51
## 2 Pride & Prejudice         62
## 3 Mansfield Park            49
## 4 Emma                      56
## 5 Northanger Abbey          32
## 6 Persuasion                25
bingnegative <- get_sentiments("bing") %>%
  filter(sentiment == "negative")

wordcounts <- tidy_books %>%
  group_by(book, chapter) %>%
  summarize(words = n())
## `summarise()` regrouping output by 'book' (override with `.groups` argument)
tidy_books %>%
  semi_join(bingnegative) %>%
  group_by(book, chapter) %>%
  summarize(negativewords = n()) %>%
  left_join(wordcounts, by = c("book", "chapter")) %>%
  mutate(ratio = negativewords / words) %>%
  filter(chapter != 0) %>%
  top_n(1) %>%
  ungroup()
## Joining, by = "word"
## `summarise()` regrouping output by 'book' (override with `.groups` argument)
## Selecting by ratio
## # A tibble: 6 x 5
##   book                chapter negativewords words  ratio
##   <fct>                 <int>         <int> <int>  <dbl>
## 1 Sense & Sensibility      43           161  3405 0.0473
## 2 Pride & Prejudice        34           111  2104 0.0528
## 3 Mansfield Park           46           173  3685 0.0469
## 4 Emma                     15           151  3340 0.0452
## 5 Northanger Abbey         21           149  2982 0.0500
## 6 Persuasion                4            62  1807 0.0343

Part 2

Transforming the Data Frame

# loading book
dorian <- gutenberg_works(title == 'The Picture of Dorian Gray')
dorian_corpus <- gutenberg_download(dorian$gutenberg_id)
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
# creating a data frame by tokenization of chapters, paragraphs, and words
dc <- dorian_corpus %>%
  mutate(
    chapter_num = cumsum(str_detect(text, regex('^chapter [0-9]+', ignore_case = TRUE)))
  ) %>%
  filter(chapter_num != 0) %>%
  unnest_tokens(chapter, text, token = 'regex', pattern = 'CHAPTER [0-9]+') %>%
  unnest_tokens(paragraph, chapter, token = 'paragraphs') %>%
  group_by(chapter_num) %>%
  mutate(paragraph_num = 1:n()) %>%
  ungroup() %>%
  unnest_tokens(word, paragraph) %>%
  anti_join(stop_words)
## Joining, by = "word"

Exploratory Analysis

# Words by Chapter
dc %>% 
  group_by(chapter_num) %>%
  count(word, sort = TRUE) %>%
  mutate(chapter_num = reorder(chapter_num, n)) %>%
  ggplot(aes(chapter_num, n)) +
  geom_col(fill = 'steelblue') +
  ylab('Word Count') +
  coord_flip()

# Top 10 words in book
dc %>% 
  count(word, sort = TRUE) %>%
  top_n(10) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word,n)) +
  geom_col(fill = 'steelblue') +
  ylab('Word Count') +
  coord_flip()
## Selecting by n

# Top words by chapter
dc %>% 
  group_by(chapter_num) %>%
  count(word) %>%
  top_n(1) %>%
  ungroup() %>%
  arrange(chapter_num)
## Selecting by n
## # A tibble: 23 x 3
##    chapter_num word       n
##          <int> <chr>  <int>
##  1           1 dorian    22
##  2           1 harry     22
##  3           1 lord      22
##  4           2 dorian    43
##  5           3 lord      34
##  6           4 harry     29
##  7           5 mother    26
##  8           5 sibyl     26
##  9           6 dorian    26
## 10           7 dorian    25
## # ... with 13 more rows

Finding Sentiments

# getting the sentiment of each word as a numeric vector using the syuzhet lexicon
sentiment_vector <- get_sentiment(dc$word, method = 'syuzhet')

dc <- cbind(dc, sentiment_vector)

# calculating sentiment by paragraph
dc_sent_para <- dc %>%
  group_by(chapter_num, paragraph_num) %>%
  summarize(sentiment = sum(sentiment_vector)) 
## `summarise()` regrouping output by 'chapter_num' (override with `.groups` argument)

Analysis

# now for the part we've been waiting for
# graph using syuzhet lexicon
ggplot(data = dc_sent_para, aes(paragraph_num, sentiment, fill = chapter_num)) + 
  geom_col() + 
  facet_wrap(~ chapter_num)

# graph of proportion of positive and negative paragraphs
dc_sent_para %>%
  mutate(sentiment2 = ifelse(sentiment >= 0, 'positive', 'negative')) %>%
  group_by(sentiment2) %>%
  summarize(n = n()) %>%
  mutate(freq = round(n / sum(n),2)) %>%
  ggplot(aes(x = sentiment2, y = freq)) +
  geom_col(fill = 'steelblue') +
  geom_text(aes(label = freq), vjust = 2, color = 'white', size = 5)
## `summarise()` ungrouping output (override with `.groups` argument)

# wordcloud using loughran lexicon
dc %>%
  inner_join(get_sentiments('loughran')) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = 'n', fill = 0) %>%
  comparison.cloud(
    colors = c("gray20", "gray80"),
    max.words = 100
  )
## Joining, by = "word"

Conclusion

“The Picture of Dorian Gray” has nearly equal proportions of positive- and negative-sentiment paragraphs. What is surprising is that the final chapter of the book consists almost entirely of negative-sentiment paragraphs.