PART 2 OF 2: SENTIMENT ANALYSIS

My corpus is “The Great Gatsby” by F. Scott Fitzgerald.

About the new corpus and lexicon

In Part 1, the janeaustenr package was used to explore tidying text from Jane Austen’s novels. For this assignment, I used the gutenbergr package (Robinson 2016) to locate “The Great Gatsby” by F. Scott Fitzgerald. The gutenbergr package provides access to public domain works from the Project Gutenberg collection. It includes both tools for downloading books and a complete dataset of Project Gutenberg metadata that can be used to find works of interest.
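
For example, the bundled metadata can be searched with gutenberg_works() to find a book’s Gutenberg ID before downloading it. A minimal sketch, assuming gutenbergr and dplyr are loaded as in the next section (the exact record returned depends on the metadata snapshot shipped with the package):

gutenberg_works(title == "The Great Gatsby") |>   # filter the bundled metadata by title
  select(gutenberg_id, title, author)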

Installing and loading the gutenbergr package

#devtools::install_github("ropensci/gutenbergr", force = TRUE)
library(gutenbergr)
library(tidyverse)
## -- Attaching core tidyverse packages ------------------------ tidyverse 2.0.0 --
## v dplyr     1.1.0     v readr     2.1.4
## v forcats   1.0.0     v stringr   1.5.0
## v ggplot2   3.4.3     v tibble    3.1.8
## v lubridate 1.9.2     v tidyr     1.3.0
## v purrr     1.0.2     
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## i Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidyr)
library(ggplot2)
library(wordcloud)
## Loading required package: RColorBrewer
library(reshape2)
## 
## Attaching package: 'reshape2'
## 
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(tidytext)
library(textdata)

Sentiment Analysis for F. Scott Fitzgerald’s “The Great Gatsby”

  # Load the book and tidy the text into word tokens

gatsby <- gutenberg_download(64317)    # The Great Gatsby by F. Scott Fitzgerald
## Determining mirror for Project Gutenberg from https://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
tidy_gatsby <- gatsby |>
  unnest_tokens(word, text) |>
  anti_join(stop_words)
## Joining with `by = join_by(word)`
  # Most common words in the book

tidy_gatsby |>
  count(word, sort = TRUE)
## # A tibble: 4,982 x 2
##    word           n
##    <chr>      <int>
##  1 o'connor      90
##  2 time          89
##  3 sir           75
##  4 castle        64
##  5 lady          62
##  6 fitzgerald    52
##  7 door          50
##  8 iv            46
##  9 le            44
## 10 robert        44
## # i 4,972 more rows
  # Wordcloud of the most frequent words

tidy_gatsby |>
  count(word) |>
  with(wordcloud(word, n, max.words = 75))

Using the “loughran” (Loughran-McDonald) lexicon

get_sentiments("loughran") |> 
  filter(sentiment %in% c("positive", "negative")) |> 
  count(sentiment)
## # A tibble: 2 x 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   2355
## 2 positive    354
sentiment_positive <- get_sentiments("loughran") |> 
  filter(sentiment == "positive")

tidy_gatsby |>
  inner_join(sentiment_positive) |>
  count(word, sort = TRUE) |>
  ungroup()
## Joining with `by = join_by(word)`
## # A tibble: 83 x 2
##    word           n
##    <chr>      <int>
##  1 perfectly     19
##  2 dream          9
##  3 beautiful      8
##  4 excitement     8
##  5 strength       8
##  6 strong         8
##  7 easily         6
##  8 perfect        6
##  9 pleasure       6
## 10 succeeded      6
## # i 73 more rows

Create a wordcloud of the most frequent words, split by Loughran sentiment

tidy_gatsby |>
  inner_join(get_sentiments("loughran")) |>
  count(word, sentiment, sort = TRUE) |>
  acast(word ~ sentiment, value.var = "n", fill = 0) |>
  comparison.cloud(colors = c("blue", "maroon"),
                   max.words = 100)
## Joining with `by = join_by(word)`
## Warning in inner_join(tidy_gatsby, get_sentiments("loughran")): Each row in `x` is expected to match at most 1 row in `y`.
## i Row 51 of `x` matches multiple rows.
## i If multiple matches are expected, set `multiple = "all"` to silence this
##   warning.
## Warning in comparison.cloud(acast(count(inner_join(tidy_gatsby,
## get_sentiments("loughran")), : prejudices could not be fit on page. It will not
## be plotted.
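
The join warning above appears because some words carry more than one Loughran tag (for example, “doubt” is counted under both negative and uncertainty below). Following the hint in the warning, the matching behaviour can be made explicit under the dplyr 1.1.0 loaded above (newer dplyr versions prefer relationship = "many-to-many"); a minimal sketch:

tidy_gatsby |>
  inner_join(get_sentiments("loughran"), by = "word",
             multiple = "all") |>    # allow words with several sentiment tags
  count(word, sentiment, sort = TRUE)

The comparison.cloud() warning that “prejudices” could not be fit on the page can usually be addressed by shrinking its scale argument so that more words fit.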

Find the contribution of words to different sentiments with the Loughran lexicon

loughran_word_counts <- tidy_gatsby |>
  inner_join(get_sentiments("loughran")) |>
  count(word, sentiment, sort = TRUE) |>
  ungroup()
## Joining with `by = join_by(word)`
## Warning in inner_join(tidy_gatsby, get_sentiments("loughran")): Each row in `x` is expected to match at most 1 row in `y`.
## i Row 51 of `x` matches multiple rows.
## i If multiple matches are expected, set `multiple = "all"` to silence this
##   warning.
loughran_word_counts
## # A tibble: 448 x 3
##    word       sentiment       n
##    <chr>      <chr>       <int>
##  1 appeared   uncertainty    32
##  2 lost       negative       24
##  3 poor       negative       23
##  4 perfectly  positive       19
##  5 doubt      negative       13
##  6 doubt      uncertainty    13
##  7 late       negative       13
##  8 miss       negative       12
##  9 broken     negative       10
## 10 difficulty negative       10
## # i 438 more rows
loughran_word_counts |>
  group_by(sentiment) |>
  slice_max(n, n = 5) |> 
  ungroup() |>
  mutate(word = reorder(word, n)) |>
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = "Contribution to sentiment",
       y = NULL)

Begin the comparison by using the Bing lexicon to categorize words as positive or negative

bing_word_counts <- tidy_gatsby |>
  inner_join(get_sentiments("bing")) |>
  count(word, sentiment, sort = TRUE) |>
  ungroup()
## Joining with `by = join_by(word)`
bing_word_counts
## # A tibble: 1,040 x 3
##    word      sentiment     n
##    <chr>     <chr>     <int>
##  1 death     negative     38
##  2 lost      negative     24
##  3 poor      negative     23
##  4 strange   negative     21
##  5 perfectly positive     19
##  6 dead      negative     17
##  7 silent    positive     15
##  8 object    negative     14
##  9 vain      negative     14
## 10 beauty    positive     13
## # i 1,030 more rows
bing_word_counts |>
  group_by(sentiment) |>
  slice_max(n, n = 10) |> 
  ungroup() |>
  mutate(word = reorder(word, n)) |>
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = "Contribution to sentiment",
       y = NULL)

Finding joy words using the NRC lexicon

nrc_joy <- get_sentiments("nrc") |> 
  filter(sentiment == "joy")

tidy_gatsby |>
  inner_join(nrc_joy) |>
  count(word, sort = TRUE)
## Joining with `by = join_by(word)`
## # A tibble: 215 x 2
##    word        n
##    <chr>   <int>
##  1 god        35
##  2 friend     30
##  3 found      21
##  4 hope       19
##  5 mother     18
##  6 child      15
##  7 feeling    15
##  8 beauty     13
##  9 love       10
## 10 mighty      9
## # i 205 more rows

Based on the analysis above, one can clearly see that the choice of lexicon can have a large impact on the results: the Loughran, Bing, and NRC lexicons surface noticeably different words and sentiment balances from the same corpus. Careful consideration should therefore be given to the choice of lexicon before undertaking this type of analysis.
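
To make that point concrete, here is a small sketch that tallies positive and negative matches per lexicon. It reuses tidy_gatsby from above and assumes the lexicons have already been downloaded via textdata; count_polarity is a hypothetical helper written for this illustration, not part of any package:

count_polarity <- function(lex) {
  tidy_gatsby |>
    inner_join(get_sentiments(lex) |>
                 filter(sentiment %in% c("positive", "negative")),
               by = "word", multiple = "all") |>    # some words carry several tags
    count(lexicon = lex, sentiment)
}

bind_rows(lapply(c("bing", "loughran", "nrc"), count_polarity))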