data_607_assignment

The following code block loads packages.

library(tidytext)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(ggplot2)
library(wordcloud)

## Loading required package: RColorBrewer

library(reshape2)

## 
## Attaching package: 'reshape2'
## 
## The following object is masked from 'package:tidyr':
## 
##     smiths

library(xml2)

# Preview the AFINN lexicon: one row per word with a numeric sentiment value.
get_sentiments("afinn")

## # A tibble: 2,477 × 2
##    word       value
##    <chr>      <dbl>
##  1 abandon       -2
##  2 abandoned     -2
##  3 abandons      -2
##  4 abducted      -2
##  5 abduction     -2
##  6 abductions    -2
##  7 abhor         -3
##  8 abhorred      -3
##  9 abhorrent     -3
## 10 abhors        -3
## # ℹ 2,467 more rows

# Preview the Bing lexicon: one row per word with a categorical sentiment label.
get_sentiments("bing")

## # A tibble: 6,786 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 2-faces     negative 
##  2 abnormal    negative 
##  3 abolish     negative 
##  4 abominable  negative 
##  5 abominably  negative 
##  6 abominate   negative 
##  7 abomination negative 
##  8 abort       negative 
##  9 aborted     negative 
## 10 aborts      negative 
## # ℹ 6,776 more rows

# Preview the NRC lexicon: a word can appear on several rows, once for each
# sentiment category it is tagged with.
get_sentiments("nrc")

## # A tibble: 13,872 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 abacus      trust    
##  2 abandon     fear     
##  3 abandon     negative 
##  4 abandon     sadness  
##  5 abandoned   anger    
##  6 abandoned   fear     
##  7 abandoned   negative 
##  8 abandoned   sadness  
##  9 abandonment anger    
## 10 abandonment fear     
## # ℹ 13,862 more rows

The following code block loads two Lewis Carroll books from the Project Gutenberg website.

# The selected titles were not available from the Project Gutenberg package,
# so the plain-text editions are read directly from the Project Gutenberg
# website. Each data frame holds one row per line of the file, with the
# columns named at construction instead of being renamed afterwards.
wonderland <- data.frame(
  text = read_lines("https://www.gutenberg.org/cache/epub/11/pg11.txt"),
  book = "Alice's Adventures in Wonderland"
)

looking_glass <- data.frame(
  text = read_lines("https://www.gutenberg.org/cache/epub/12/pg12.txt"),
  book = "Through the Looking-Glass"
)

# Stack both books into a single data frame with columns `text` and `book`.
carroll_raw <- bind_rows(wonderland, looking_glass)

Adapted Sentiment Analysis

Much of the following code is adapted from “Text Mining with R: A Tidy Approach” by Silge & Robinson, available at https://www.tidytextmining.com/

The following code creates a tidy dataframe for the Lewis Carroll texts.

# Annotate every line with its position inside its book and a running chapter
# counter, then tokenize so that each row holds a single word.
tidy_carroll <- carroll_raw |>
  group_by(book) |>
  mutate(
    linenumber = row_number(),
    # The `+` quantifier lets multi-character numerals such as "II." or "XII."
    # match. Without it only one-character headings ("I.", "V.", "X.")
    # advanced the chapter counter.
    chapter = cumsum(str_detect(
      text,
      regex("^chapter [\\divxlc]+\\.", ignore_case = TRUE)
    ))
  ) |>
  ungroup() |>
  unnest_tokens(word, text)

The following code filters the terms labelled for “joy” from the nrc sentiment library.

# Keep only the NRC lexicon entries tagged with the "joy" sentiment.
nrc_joy <- filter(get_sentiments("nrc"), sentiment == "joy")

# Tally the joy words that occur in Alice's Adventures in Wonderland,
# most frequent first.
tidy_carroll |>
  inner_join(nrc_joy, by = join_by(word)) |>
  filter(book == "Alice's Adventures in Wonderland") |>
  count(word, sort = TRUE)

## # A tibble: 100 × 2
##    word          n
##    <chr>     <int>
##  1 found        34
##  2 good         27
##  3 garden       15
##  4 baby         14
##  5 beautiful    13
##  6 dance        13
##  7 grow         13
##  8 deal         12
##  9 child        11
## 10 glad         11
## # ℹ 90 more rows

The following code analyzes the sentiment of the Lewis Carroll texts in 80-line sections using the “bing” lexicon.

# Net Bing sentiment (positive word count minus negative word count) for each
# 80-line section of each book.
lewis_carroll_sentiment <- tidy_carroll |>
  # The join key is spelled out and the many-to-many relationship is declared:
  # words repeat in the text, and dplyr reports that at least one word matches
  # more than one Bing row, so the fan-out is expected rather than accidental.
  inner_join(
    get_sentiments("bing"),
    by = join_by(word),
    relationship = "many-to-many"
  ) |>
  count(book, index = linenumber %/% 80, sentiment) |>
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) |>
  mutate(sentiment = positive - negative)

The following code block plots the sentiment of the two Lewis Carroll texts over the course of the book using the “bing” sentiment library.

ggplot(lewis_carroll_sentiment, aes(index, sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x")

The following code analyzes sentiments using the three sentiment lexicons for Alice’s Adventures in Wonderland: afinn, bing, and nrc.

# Restrict the tokens to Alice's Adventures in Wonderland.
alice_wonderland <- filter(
  tidy_carroll,
  book == "Alice's Adventures in Wonderland"
)

# AFINN assigns each word a numeric value; sum those values within every
# 80-line section and label the result with the lexicon name.
afinn <- alice_wonderland |>
  inner_join(get_sentiments("afinn")) |>
  mutate(index = linenumber %/% 80) |>
  group_by(index) |>
  summarise(sentiment = sum(value)) |>
  mutate(method = "AFINN")

## Joining with `by = join_by(word)`

# Net sentiment (positive minus negative word count) per 80-line section of
# Alice's Adventures in Wonderland, computed separately with the Bing lexicon
# and with the positive/negative categories of the NRC lexicon.
#
# Both joins name their key explicitly and declare the many-to-many
# relationship: words repeat in the text, and a lexicon may list the same word
# under more than one sentiment, so the fan-out is expected.
bing_and_nrc <- bind_rows(
  alice_wonderland |>
    inner_join(
      get_sentiments("bing"),
      by = join_by(word),
      relationship = "many-to-many"
    ) |>
    mutate(method = "Bing et al."),
  alice_wonderland |>
    inner_join(
      get_sentiments("nrc") |>
        filter(sentiment %in% c("positive", "negative")),
      by = join_by(word),
      relationship = "many-to-many"
    ) |>
    mutate(method = "NRC")
) |>
  count(method, index = linenumber %/% 80, sentiment) |>
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = 0
  ) |>
  mutate(sentiment = positive - negative)

The following code plots the sentiment of Alice’s Adventures in Wonderland as measured by each of the three lexicons. As can be seen, NRC yields a more positive sentiment overall, but all three lexicons show similar fluctuations in sentiment over the course of the book.

# One stacked panel per lexicon, each with its own y-axis scale, so the three
# sentiment trajectories can be compared section by section.
ggplot(
  bind_rows(afinn, bing_and_nrc),
  aes(x = index, y = sentiment, fill = method)
) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(method), ncol = 1, scales = "free_y")

The counts below show the differences in negative vs. positive word counts in the nrc and bing lexicons.

# Number of NRC entries tagged positive versus negative.
count(
  filter(get_sentiments("nrc"), sentiment %in% c("positive", "negative")),
  sentiment
)

## # A tibble: 2 × 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   3316
## 2 positive   2308

# Number of Bing entries per sentiment label.
count(get_sentiments("bing"), sentiment)

## # A tibble: 2 × 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   4781
## 2 positive   2005

The following code counts the most common sentiment-bearing words (according to the “bing” lexicon) in Lewis Carroll’s texts.

# Count how often each Bing-lexicon word occurs across both books, most
# frequent first, and print the result.
bing_word_counts <- tidy_carroll |>
  # Explicit key and declared relationship: words repeat in the text and dplyr
  # reports that at least one word matches more than one Bing row, so the
  # many-to-many fan-out is expected.
  inner_join(
    get_sentiments("bing"),
    by = join_by(word),
    relationship = "many-to-many"
  ) |>
  # count() on ungrouped input already returns an ungrouped tibble, so no
  # ungroup() is needed afterwards.
  count(word, sentiment, sort = TRUE) |>
  print()

## # A tibble: 711 × 3
##    word   sentiment     n
##    <chr>  <chr>     <int>
##  1 like   positive    207
##  2 well   positive    126
##  3 work   positive    104
##  4 great  positive     89
##  5 works  positive     70
##  6 good   positive     64
##  7 right  positive     59
##  8 mock   negative     57
##  9 poor   negative     49
## 10 better positive     44
## # ℹ 701 more rows

The following code plots the most common positive and negative words in Lewis Carroll’s texts.

# Bar chart of the ten most frequent words for each Bing sentiment label,
# ordered by frequency within the plot.
bing_word_counts |>
  group_by(sentiment) |>
  slice_max(order_by = n, n = 10) |>
  ungroup() |>
  mutate(word = reorder(word, n)) |>
  ggplot(aes(x = n, y = word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(sentiment), scales = "free_y") +
  labs(x = "Contribution to sentiment", y = NULL)

# Prepend a custom entry for "gutenberg" to the stop_words table
# (presumably to keep Project Gutenberg boilerplate out of the word cloud).
custom_stop_words <- tibble(word = "gutenberg", lexicon = "custom") |>
  bind_rows(stop_words)

The following code creates word clouds from the Carroll texts.

# Word cloud of up to 100 words from both books after removing stop words.
tidy_carroll |>
  anti_join(custom_stop_words) |>
  count(word) |>
  with(wordcloud(words = word, freq = n, max.words = 100))

## Joining with `by = join_by(word)`

The following code creates a word cloud of the most common positive and negative sentiment words in the Lewis Carroll works.

# Comparison cloud contrasting the most frequent negative and positive Bing
# words across both books.
tidy_carroll |>
  # Explicit key and declared relationship: words repeat in the text and dplyr
  # reports that at least one word matches more than one Bing row, so the
  # many-to-many fan-out is expected.
  inner_join(
    get_sentiments("bing"),
    by = join_by(word),
    relationship = "many-to-many"
  ) |>
  count(word, sentiment, sort = TRUE) |>
  # Reshape to a word-by-sentiment matrix, which comparison.cloud() expects.
  acast(word ~ sentiment, value.var = "n", fill = 0) |>
  comparison.cloud(
    colors = c("gray80", "gray20"),
    max.words = 100
  )

Original Sentiment Analysis

The following code uses the sentimentr package to analyze the Lewis Carroll text by sentence.

This loads the sentimentr package

library(sentimentr)

The following code loads the Lewis Carroll texts, separates them by sentence, and combines them into a dataframe for analysis. For these purposes, this dataframe is tidy.

# Read one plain-text book and return it as one row per sentence.
#
# url:       location of the plain-text file, read line by line.
# title:     value stored in the `book` column of the result.
# skip_rows: number of leading rows dropped after sentence splitting
#            (presumably front matter before the first chapter -- confirm
#            against the downloaded file if the source text changes).
#
# Returns a tibble with columns `sentence`, `book`, `chapter` (running count
# of rows that start with "chapter") and `sentence_no` (row position within
# its chapter).
read_book_sentences <- function(url, title, skip_rows) {
  # Collapse the file into a single string so sentences can span line breaks.
  full_text <- str_c(read_lines(url), collapse = " ")

  data.frame(text = full_text) |>
    unnest_tokens(sentence, text, token = "sentences") |>
    # Splitting on the literal "chapter" starts a new row at every occurrence
    # and removes the word itself.
    separate_longer_delim(sentence, delim = "chapter") |>
    mutate(
      # Replace the whitespace before any run of i/v/x/l/c letters with
      # "chapter ", then strip "chapter" again wherever it directly follows a
      # lowercase letter, a non-word character or an underscore. The marker
      # therefore survives only at the very start of a row.
      # NOTE(review): this appears intended to restore the heading marker
      # removed by the split above -- confirm.
      sentence = str_replace_all(sentence, "\\s+([ivxlc]+)", "chapter \\1"),
      sentence = str_replace_all(sentence, "([a-z\\W_])chapter", "\\1 "),
      book = title
    ) |>
    filter(row_number() > skip_rows) |>
    mutate(chapter = cumsum(str_detect(sentence, regex("^chapter")))) |>
    group_by(chapter) |>
    mutate(sentence_no = row_number()) |>
    ungroup()
}

wonder <- read_book_sentences(
  "https://www.gutenberg.org/cache/epub/11/pg11.txt",
  title = "Alice's Adventures in Wonderland",
  skip_rows = 29
)

glass <- read_book_sentences(
  "https://www.gutenberg.org/cache/epub/12/pg12.txt",
  title = "Through the Looking-Glass",
  skip_rows = 129
)

# Stack the sentence tables of both books.
carroll_sentences <- bind_rows(wonder, glass)

The following code uses sentimentr to analyze the sentiment of the Lewis Carroll works by chapter.

library(magrittr)

## 
## Attaching package: 'magrittr'

## The following object is masked from 'package:purrr':
## 
##     set_names

## The following object is masked from 'package:tidyr':
## 
##     extract

# Average sentimentr score for every (book, chapter) pair. The columns of the
# sentence table are exposed to sentiment_by() through with().
carroll_sentiment_chapter <- with(
  get_sentences(carroll_sentences),
  sentiment_by(sentence, list(book, chapter))
)

# Show the first few chapter-level results.
head(carroll_sentiment_chapter)

## Key: <book, chapter>
##                                book chapter word_count        sd ave_sentiment
##                              <char>   <int>      <int>     <num>         <num>
## 1: Alice's Adventures in Wonderland       1       2195 0.2769986   0.099602456
## 2: Alice's Adventures in Wonderland       2       2188 0.2926723  -0.032208215
## 3: Alice's Adventures in Wonderland       3       1741 0.2891973  -0.037590276
## 4: Alice's Adventures in Wonderland       4       2740 0.2154387   0.016102891
## 5: Alice's Adventures in Wonderland       5       2245 0.2392938  -0.007291635
## 6: Alice's Adventures in Wonderland       6       2672 0.3035419   0.010506532

The following plot shows the average sentiment throughout the two books by chapter.

 # Average sentiment per chapter, one stacked panel per book with its own
 # y-axis scale.
 ggplot(
   carroll_sentiment_chapter,
   aes(x = chapter, y = ave_sentiment, fill = book)
 ) +
   geom_col(show.legend = FALSE) +
   facet_wrap(vars(book), ncol = 1, scales = "free_y")

data_607_assignment_10

Maxfield Raynolds

2025-04-09

Adapted Sentiment Analysis

Original Sentiment Analysis