DATA 607 Week 10 JB

##Packages

library(tidyverse)
library(stringr)
library(tidytext)
library(janeaustenr)
library(textdata)
library(gutenbergr)

##Example from book:

# Tokenise the Jane Austen novels into one word per row, keeping for every
# word the line it came from and the chapter counter at that line.
tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(
    # Line position within its own book (the numbering restarts per book).
    linenumber = row_number(),
    # Running count of lines that look like a chapter heading; lines before
    # the first heading therefore get chapter 0.
    chapter = text %>%
      str_detect(regex("^chapter [\\divxlc]", ignore_case = TRUE)) %>%
      cumsum()
  ) %>%
  ungroup() %>%
  unnest_tokens(output = word, input = text)

Source: http://saifmohammad.com/WebDocs/Lexicons/NRC-Emotion-Lexicon.zip

Citation Info: Version: 0.92 Publicly Released: 10 July 2011 Created By: Dr. Saif M. Mohammad, Dr. Peter Turney Home Page: http://saifmohammad.com/WebPages/NRC-Emotion-Lexicon.htm

#Using NRC

# Keep only the NRC lexicon entries tagged with the "joy" sentiment.
nrc_joy <- filter(get_sentiments("nrc"), sentiment == "joy")

# Tally the joy words that occur in "Emma", most frequent first.
filter(tidy_books, book == "Emma") %>%
  inner_join(y = nrc_joy) %>%
  count(word, sort = TRUE)

## Joining with `by = join_by(word)`

## # A tibble: 301 × 2
##    word          n
##    <chr>     <int>
##  1 good        359
##  2 friend      166
##  3 hope        143
##  4 happy       125
##  5 love        117
##  6 deal         92
##  7 found        92
##  8 present      89
##  9 kind         82
## 10 happiness    76
## # ℹ 291 more rows

##Using Bing

# Net Bing sentiment (positive word count minus negative word count) for
# every 80-line section of each Austen novel.
jane_austen_sentiment <- tidy_books %>%
  # Without `relationship`, this join warns that some tokens match more than
  # one lexicon row (and every lexicon word matches many tokens). Declaring
  # the many-to-many relationship keeps the joined rows exactly as before
  # while stating that the duplication is expected.
  inner_join(get_sentiments("bing"), relationship = "many-to-many") %>%
  # Integer division buckets the line numbers into 80-line sections.
  count(book, index = linenumber %/% 80, sentiment) %>%
  # One column per sentiment; sections lacking a sentiment get a count of 0.
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)

## Joining with `by = join_by(word)`

## Warning in inner_join(., get_sentiments("bing")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 435434 of `x` matches multiple rows in `y`.
## ℹ Row 5051 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

##Austen Sentiment plot

# One panel per novel: net sentiment of each section along the book.
jane_austen_sentiment %>%
  ggplot(mapping = aes(x = index, y = sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(facets = ~book, ncol = 2, scales = "free_x")

##Comparison of the sentiment dictionaries

Entirety of P&P book text

# Restrict the tokenised corpus to the words of "Pride & Prejudice".
pride_prejudice <- filter(tidy_books, book == "Pride & Prejudice")

# Print the subset for inspection.
pride_prejudice

## # A tibble: 122,204 × 4
##    book              linenumber chapter word     
##    <fct>                  <int>   <int> <chr>    
##  1 Pride & Prejudice          1       0 pride    
##  2 Pride & Prejudice          1       0 and      
##  3 Pride & Prejudice          1       0 prejudice
##  4 Pride & Prejudice          3       0 by       
##  5 Pride & Prejudice          3       0 jane     
##  6 Pride & Prejudice          3       0 austen   
##  7 Pride & Prejudice          7       1 chapter  
##  8 Pride & Prejudice          7       1 1        
##  9 Pride & Prejudice         10       1 it       
## 10 Pride & Prejudice         10       1 is       
## # ℹ 122,194 more rows

##AFINN, BING and NRC sentiments

# AFINN sentiment of "Pride & Prejudice": sum the AFINN `value` of the
# matched words within each 80-line section.
afinn <- pride_prejudice %>%
  inner_join(get_sentiments("afinn")) %>%
  mutate(index = linenumber %/% 80) %>%
  group_by(index) %>%
  summarise(sentiment = sum(value)) %>%
  # Label the rows so they can be stacked with the other lexicons' results.
  mutate(method = "AFINN")

# Bing and NRC sentiment of "Pride & Prejudice": for each lexicon, count the
# positive and negative words per 80-line section and take the difference.
bing_and_nrc <- bind_rows(
  # Words matched against the Bing lexicon.
  pride_prejudice %>%
    inner_join(get_sentiments("bing")) %>%
    mutate(method = "Bing et al."),
  # Words matched against NRC, restricted to its positive/negative entries.
  pride_prejudice %>%
    inner_join(
      filter(get_sentiments("nrc"), sentiment %in% c("positive", "negative"))
    ) %>%
    mutate(method = "NRC")
) %>%
  mutate(index = linenumber %/% 80) %>%
  count(method, index, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)

Plot to compare

# Stack the AFINN, Bing and NRC summaries and draw one panel per lexicon,
# each with its own y scale.
bind_rows(afinn, bing_and_nrc) %>%
  ggplot(mapping = aes(x = index, y = sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(facets = ~method, ncol = 1, scales = "free_y")

The three sentiment dictionaries show broadly similar positive and negative trajectories across Austen’s Pride and Prejudice. On closer examination, however, NRC under-represents negative sentiment, while AFINN and Bing both capture it better. This is likely due to how NRC is structured compared with the others and how it defines negative and positive words.

##Examining a separate text

Project Gutenberg book IDs used below: 1952, 60481, 3015.

I will be using Project Gutenberg and its R package to examine the sentiment of a different author, Charlotte Perkins Gilman.

# Load the AFINN lexicon and print it.
# NOTE: this rebinds `afinn`, which previously held the per-section
# "Pride & Prejudice" summary.
afinn <- get_sentiments("afinn")
afinn

## # A tibble: 2,477 × 2
##    word       value
##    <chr>      <dbl>
##  1 abandon       -2
##  2 abandoned     -2
##  3 abandons      -2
##  4 abducted      -2
##  5 abduction     -2
##  6 abductions    -2
##  7 abhor         -3
##  8 abhorred      -3
##  9 abhorrent     -3
## 10 abhors        -3
## # ℹ 2,467 more rows

# Load the Bing lexicon and print it.
bing <- get_sentiments("bing")
bing

## # A tibble: 6,786 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 2-faces     negative 
##  2 abnormal    negative 
##  3 abolish     negative 
##  4 abominable  negative 
##  5 abominably  negative 
##  6 abominate   negative 
##  7 abomination negative 
##  8 abort       negative 
##  9 aborted     negative 
## 10 aborts      negative 
## # ℹ 6,776 more rows

# Load the NRC lexicon and print it.
nrc <- get_sentiments("nrc")
nrc

## # A tibble: 13,872 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 abacus      trust    
##  2 abandon     fear     
##  3 abandon     negative 
##  4 abandon     sadness  
##  5 abandoned   anger    
##  6 abandoned   fear     
##  7 abandoned   negative 
##  8 abandoned   sadness  
##  9 abandonment anger    
## 10 abandonment fear     
## # ℹ 13,862 more rows

# List the distinct sentiment labels present in the NRC lexicon.
unique(nrc[["sentiment"]])

##  [1] "trust"        "fear"         "negative"     "sadness"      "anger"       
##  [6] "surprise"     "positive"     "disgust"      "joy"          "anticipation"

A for loop iterates over the Gutenberg IDs and replaces each one with its book title, so the books are labelled by name in the analysis further down.

# Download the three texts with these Project Gutenberg ids into one table.
C_P_G <- gutenberg_download(gutenberg_id = c(1952, 60481, 3015))

## Determining mirror for Project Gutenberg from https://www.gutenberg.org/robot/harvest

## Using mirror http://aleph.gutenberg.org

# Gutenberg ids and, position for position, the title each id stands for.
replace_values <- c(3015, 60481, 1952)
replacement_names <- c("The Man-Made World", "In this our world", "The Yellow Wallpaper")

# Overwrite each id with its title so the books can be compared by name
# later on. The first replacement turns the column into character values.
for (i in seq_along(replace_values)) {
  C_P_G[["gutenberg_id"]][C_P_G[["gutenberg_id"]] == replace_values[[i]]] <-
    replacement_names[[i]]
}

# One word per row, with the words listed in `stop_words` removed.
tidy_CPG <- C_P_G %>%
  unnest_tokens(output = word, input = text) %>%
  anti_join(y = stop_words)

## Joining with `by = join_by(word)`

Most common words?

The Man-Made World - 3015; In this our world - 60481; The Yellow Wallpaper - 1952

# Most common words across the three texts, most frequent first.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, whereas `TRUE` is a reserved word.
tidy_CPG %>%
    count(word, sort = TRUE)

## # A tibble: 8,256 × 2
##    word          n
##    <chr>     <int>
##  1 women       400
##  2 human       353
##  3 life        286
##  4 love        225
##  5 world       213
##  6 day         137
##  7 masculine   136
##  8 woman       136
##  9 male        134
## 10 social      124
## # ℹ 8,246 more rows

# Rebuild the tokenised table, this time keeping the line number of every
# word within its own book (the numbering restarts per book).
tidy_CPG <- C_P_G %>%
  rename(book = gutenberg_id) %>%
  group_by(book) %>%
  mutate(linenumber = row_number()) %>%
  ungroup() %>%
  unnest_tokens(output = word, input = text)

# Net Bing sentiment (positive minus negative word count) for every 30-line
# section of each book.
CPG_sentiment <- tidy_CPG %>%
  inner_join(y = bing) %>%
  mutate(index = linenumber %/% 30) %>%
  count(book, index, sentiment) %>%
  # One column per sentiment; sections lacking a sentiment get a count of 0.
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)

## Joining with `by = join_by(word)`

# One panel per book: net sentiment of each section along the text.
CPG_sentiment %>%
  ggplot(mapping = aes(x = index, y = sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(facets = ~book, ncol = 2, scales = "free_x")

We looked at anger; how about sadness? Let's visualize sadness like we did for anger, but this time using AFINN instead of NRC:

# AFINN score per 10-line section: sum the AFINN `value` of the matched words.
# NOTE(review): `linenumber` restarts for every book and `book` is not part
# of the grouping, so each section pools all three books -- confirm that this
# is intended.
afinn_sad <- tidy_CPG %>%
  inner_join(get_sentiments("afinn")) %>%
  mutate(index = linenumber %/% 10) %>%
  group_by(index) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(method = "AFINN")

## Joining with `by = join_by(word)`

# Bar chart of the AFINN score per section.
# `fill` is mapped to the `method` column; the previous quoted "method" was
# treated by aes() as a constant string rather than as the column.
# `FALSE` is spelled out because `F` is a reassignable variable.
ggplot(afinn_sad, aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, scales = "free_y")

Does this change with NRC and Bing?

# Keep only the NRC lexicon entries tagged with the "sadness" sentiment.
sadness_nrc <- nrc %>%
    filter(sentiment == "sadness")

# Tally the sadness words across the three texts, most frequent first.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, whereas `TRUE` is a reserved word.
tidy_CPG %>% 
    inner_join(sadness_nrc) %>% 
    count(word, sort = TRUE)

## Joining with `by = join_by(word)`

## # A tibble: 406 × 2
##    word         n
##    <chr>    <int>
##  1 mother     109
##  2 art         82
##  3 pain        59
##  4 death       40
##  5 sin         35
##  6 die         34
##  7 warfare     32
##  8 blue        27
##  9 shame       24
## 10 struggle    24
## # ℹ 396 more rows

# Net Bing sentiment (positive minus negative word count) for every 10-line
# section of each book.
CPG_sent <- tidy_CPG %>%
  inner_join(y = get_sentiments("bing")) %>%
  mutate(index = linenumber %/% 10) %>%
  count(book, index, sentiment) %>%
  # One column per sentiment; sections lacking a sentiment get a count of 0.
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)

## Joining with `by = join_by(word)`

# One panel per book (laid out over two rows), each with its own y scale.
# `FALSE` is spelled out because `F` is an ordinary variable that can be
# reassigned, whereas `FALSE` is a reserved word.
ggplot(CPG_sent, aes(index, sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, scales = "free_y", nrow = 2)

##Conclusion: The data above further highlights the difference in length between the texts and how it can affect the visualization and interpretation of results from these text-mining packages. Overall, the first iteration using Bing provided the best look into the three books by Charlotte Perkins Gilman. I wanted to use Gilman's works because The Yellow Wallpaper was one of the darkest things I've read, and I wanted to see whether these lexicons could pick up on that, and also whether there were any significant trends in her other work. While it isn't apparent in her other work, the analysis definitely shows that her short story The Yellow Wallpaper is much darker than the others.

DATA 607 Week 10 JB

Jonathan Burns

2023-11-11