PART 2 OF 2: SENTIMENT ANALYSIS

My corpus is “The Great Gatsby” by F. Scott Fitzgerald.

About the new corpus and lexicon

In Part 1, the janeaustenr package was used to explore tidying text from Jane Austen’s novels. For this assignment, I used the gutenbergr package (Robinson 2016) to locate “The Great Gatsby” by F. Scott Fitzgerald. The gutenbergr package provides access to public domain works from the Project Gutenberg collection. It includes both tools for downloading books and a complete dataset of Project Gutenberg metadata that can be used to find works of interest.
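
For example, the bundled metadata can be searched with gutenberg_works() to find a book’s Gutenberg ID before downloading it. A minimal sketch, assuming gutenbergr and dplyr are loaded as in the next section (the exact record returned depends on the metadata snapshot shipped with the package):

gutenberg_works(title == "The Great Gatsby") |>   # filter the bundled metadata by title
  select(gutenberg_id, title, author)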

Installing and loading the gutenbergr package

#devtools::install_github("ropensci/gutenbergr", force = TRUE)
library(gutenbergr)
library(tidyverse)
## -- Attaching core tidyverse packages ------------------------ tidyverse 2.0.0 --
## v dplyr     1.1.0     v readr     2.1.4
## v forcats   1.0.0     v stringr   1.5.0
## v ggplot2   3.4.3     v tibble    3.1.8
## v lubridate 1.9.2     v tidyr     1.3.0
## v purrr     1.0.2     
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## i Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidyr)
library(ggplot2)
library(wordcloud)
## Loading required package: RColorBrewer
library(reshape2)
## 
## Attaching package: 'reshape2'
## 
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(tidytext)
library(textdata)

Sentiment Analysis for F. Scott Fitzgerald’s “The Great Gatsby”

  # Load the book and tidy the text into word tokens

gatsby <- gutenberg_download(64317)    # The Great Gatsby by F. Scott Fitzgerald
## Determining mirror for Project Gutenberg from https://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
tidy_gatsby <- gatsby |>
  unnest_tokens(word, text) |>
  anti_join(stop_words)
## Joining with `by = join_by(word)`
  # Most common words in the book

tidy_gatsby |>
  count(word, sort = TRUE)
## # A tibble: 4,982 x 2
##    word           n
##    <chr>      <int>
##  1 o'connor      90
##  2 time          89
##  3 sir           75
##  4 castle        64
##  5 lady          62
##  6 fitzgerald    52
##  7 door          50
##  8 iv            46
##  9 le            44
## 10 robert        44
## # i 4,972 more rows
  # Wordcloud of the most frequent words

tidy_gatsby |>
  count(word) |>
  with(wordcloud(word, n, max.words = 75))

Using the “loughran” (Loughran-McDonald) lexicon

get_sentiments("loughran") |> 
  filter(sentiment %in% c("positive", "negative")) |> 
  count(sentiment)
## # A tibble: 2 x 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   2355
## 2 positive    354
sentiment_positive <- get_sentiments("loughran") |> 
  filter(sentiment == "positive")

tidy_gatsby |>
  inner_join(sentiment_positive) |>
  count(word, sort = TRUE) |>
  ungroup()
## Joining with `by = join_by(word)`
## # A tibble: 83 x 2
##    word           n
##    <chr>      <int>
##  1 perfectly     19
##  2 dream          9
##  3 beautiful      8
##  4 excitement     8
##  5 strength       8
##  6 strong         8
##  7 easily         6
##  8 perfect        6
##  9 pleasure       6
## 10 succeeded      6
## # i 73 more rows

Create a wordcloud of the most frequent words, split by Loughran sentiment

tidy_gatsby |>
  inner_join(get_sentiments("loughran")) |>
  count(word, sentiment, sort = TRUE) |>
  acast(word ~ sentiment, value.var = "n", fill = 0) |>
  comparison.cloud(colors = c("blue", "maroon"),
                   max.words = 100)
## Joining with `by = join_by(word)`
## Warning in inner_join(tidy_gatsby, get_sentiments("loughran")): Each row in `x` is expected to match at most 1 row in `y`.
## i Row 51 of `x` matches multiple rows.
## i If multiple matches are expected, set `multiple = "all"` to silence this
##   warning.
## Warning in comparison.cloud(acast(count(inner_join(tidy_gatsby,
## get_sentiments("loughran")), : prejudices could not be fit on page. It will not
## be plotted.
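
The join warning above appears because some words carry more than one Loughran tag (for example, “doubt” is counted under both negative and uncertainty below). Following the hint in the warning, the matching behaviour can be made explicit under the dplyr 1.1.0 loaded above (newer dplyr versions prefer relationship = "many-to-many"); a minimal sketch:

tidy_gatsby |>
  inner_join(get_sentiments("loughran"), by = "word",
             multiple = "all") |>    # allow words with several sentiment tags
  count(word, sentiment, sort = TRUE)

The comparison.cloud() warning that “prejudices” could not be fit on the page can usually be addressed by shrinking its scale argument so that more words fit.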

Find the contribution of words to different sentiments with the Loughran lexicon

loughran_word_counts <- tidy_gatsby |>
  inner_join(get_sentiments("loughran")) |>
  count(word, sentiment, sort = TRUE) |>
  ungroup()
## Joining with `by = join_by(word)`
## Warning in inner_join(tidy_gatsby, get_sentiments("loughran")): Each row in `x` is expected to match at most 1 row in `y`.
## i Row 51 of `x` matches multiple rows.
## i If multiple matches are expected, set `multiple = "all"` to silence this
##   warning.
loughran_word_counts
## # A tibble: 448 x 3
##    word       sentiment       n
##    <chr>      <chr>       <int>
##  1 appeared   uncertainty    32
##  2 lost       negative       24
##  3 poor       negative       23
##  4 perfectly  positive       19
##  5 doubt      negative       13
##  6 doubt      uncertainty    13
##  7 late       negative       13
##  8 miss       negative       12
##  9 broken     negative       10
## 10 difficulty negative       10
## # i 438 more rows
loughran_word_counts |>
  group_by(sentiment) |>
  slice_max(n, n = 5) |> 
  ungroup() |>
  mutate(word = reorder(word, n)) |>
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = "Contribution to sentiment",
       y = NULL)

Begin the comparison by using the Bing lexicon to categorize words as positive or negative

bing_word_counts <- tidy_gatsby |>
  inner_join(get_sentiments("bing")) |>
  count(word, sentiment, sort = TRUE) |>
  ungroup()
## Joining with `by = join_by(word)`
bing_word_counts
## # A tibble: 1,040 x 3
##    word      sentiment     n
##    <chr>     <chr>     <int>
##  1 death     negative     38
##  2 lost      negative     24
##  3 poor      negative     23
##  4 strange   negative     21
##  5 perfectly positive     19
##  6 dead      negative     17
##  7 silent    positive     15
##  8 object    negative     14
##  9 vain      negative     14
## 10 beauty    positive     13
## # i 1,030 more rows
bing_word_counts |>
  group_by(sentiment) |>
  slice_max(n, n = 10) |> 
  ungroup() |>
  mutate(word = reorder(word, n)) |>
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = "Contribution to sentiment",
       y = NULL)

Finding joy words using the NRC lexicon

nrc_joy <- get_sentiments("nrc") |> 
  filter(sentiment == "joy")

tidy_gatsby |>
  inner_join(nrc_joy) |>
  count(word, sort = TRUE)
## Joining with `by = join_by(word)`
## # A tibble: 215 x 2
##    word        n
##    <chr>   <int>
##  1 god        35
##  2 friend     30
##  3 found      21
##  4 hope       19
##  5 mother     18
##  6 child      15
##  7 feeling    15
##  8 beauty     13
##  9 love       10
## 10 mighty      9
## # i 205 more rows

Based on the analysis above, one can clearly see that the choice of lexicon can have a large impact on the results: the Loughran, Bing, and NRC lexicons surface noticeably different words and sentiment balances from the same corpus. Careful consideration should therefore be given to the choice of lexicon before undertaking this type of analysis.
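
To make that point concrete, here is a small sketch that tallies positive and negative matches per lexicon. It reuses tidy_gatsby from above and assumes the lexicons have already been downloaded via textdata; count_polarity is a hypothetical helper written for this illustration, not part of any package:

count_polarity <- function(lex) {
  tidy_gatsby |>
    inner_join(get_sentiments(lex) |>
                 filter(sentiment %in% c("positive", "negative")),
               by = "word", multiple = "all") |>    # some words carry several tags
    count(lexicon = lex, sentiment)
}

bind_rows(lapply(c("bing", "loughran", "nrc"), count_polarity))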