Using the sample code from Chapter 2 of Text Mining with R
library(tidytext)
library(tidyr)
library(janeaustenr)
library(dplyr)
library(stringr)
library(ggplot2)
library(wordcloud)
library(reshape2)
library(glue)
get_sentiments("afinn")
## # A tibble: 2,477 × 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # … with 2,467 more rows
get_sentiments("bing")
## # A tibble: 6,786 × 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # … with 6,776 more rows
get_sentiments("nrc")
## # A tibble: 13,872 × 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # … with 13,862 more rows
tidy_books <- austen_books() %>%
group_by(book) %>%
mutate(
linenumber = row_number(),
chapter = cumsum(str_detect(text,
regex("^chapter [\\divxlc]",
ignore_case = TRUE)))) %>%
ungroup() %>%
unnest_tokens(word, text)
nrc_joy <- get_sentiments("nrc") %>%
filter(sentiment == "joy")
tidy_books %>%
filter(book == "Emma") %>%
inner_join(nrc_joy) %>%
count(word, sort = TRUE)
## Joining, by = "word"
## # A tibble: 301 × 2
## word n
## <chr> <int>
## 1 good 359
## 2 friend 166
## 3 hope 143
## 4 happy 125
## 5 love 117
## 6 deal 92
## 7 found 92
## 8 present 89
## 9 kind 82
## 10 happiness 76
## # … with 291 more rows
jane_austen_sentiment <- tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(book, index = linenumber %/% 80, sentiment) %>%
pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining, by = "word"
# Plotting the sentiment scores
ggplot(jane_austen_sentiment, aes(index, sentiment, fill = book)) +
geom_col(show.legend = FALSE) +
facet_wrap(~book, ncol = 2, scales = "free_x")
Comparing the NRC lexicon to others
get_sentiments("nrc") %>%
filter(sentiment %in% c("positive", "negative")) %>%
count(sentiment)
## # A tibble: 2 × 2
## sentiment n
## <chr> <int>
## 1 negative 3316
## 2 positive 2308
get_sentiments("bing") %>%
count(sentiment)
## # A tibble: 2 × 2
## sentiment n
## <chr> <int>
## 1 negative 4781
## 2 positive 2005
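Both lexicons contain more negative than positive words, but the negative-to-positive ratio is higher in Bing than in NRC. A quick check from the counts above:
# Ratio of negative to positive words in each lexicon
3316 / 2308  # NRC: ~1.44
4781 / 2005  # Bing: ~2.38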
bing_word_counts <- tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
ungroup()
## Joining, by = "word"
bing_word_counts %>%
group_by(sentiment) %>%
slice_max(n, n = 10) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(n, word, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(x = "Contribution to sentiment",
y = NULL)
custom_stop_words <- bind_rows(tibble(word = c("miss"),
lexicon = c("custom")),
stop_words)
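custom_stop_words isn't applied anywhere above, so here is a minimal sketch (my addition, not the book's code) of how it could be used to drop "miss", which Bing codes as negative even though Austen mostly uses it as a title:
# Sketch: recount Bing sentiment contributions without the custom stop words
tidy_books %>%
  anti_join(custom_stop_words, by = "word") %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment, sort = TRUE)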
Generating word clouds
tidy_books %>%
anti_join(stop_words) %>%
count(word) %>%
with(wordcloud(word, n, max.words = 100))
## Joining, by = "word"
tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
acast(word ~ sentiment, value.var = "n", fill = 0) %>%
comparison.cloud(colors = c("gray20", "gray80"),
max.words = 100)
## Joining, by = "word"
Now we’ll look at the text at the sentence level rather than the word level
p_and_p_sentences <- tibble(text = prideprejudice) %>%
unnest_tokens(sentence, text, token = "sentences")
# can also split tokens via regex
austen_chapters <- austen_books() %>%
group_by(book) %>%
unnest_tokens(chapter, text, token = "regex",
pattern = "Chapter|CHAPTER [\\dIVXLC]") %>%
ungroup()
austen_chapters %>%
group_by(book) %>%
summarise(chapters = n())
## # A tibble: 6 × 2
## book chapters
## <fct> <int>
## 1 Sense & Sensibility 51
## 2 Pride & Prejudice 62
## 3 Mansfield Park 49
## 4 Emma 56
## 5 Northanger Abbey 32
## 6 Persuasion 25
bingnegative <- get_sentiments("bing") %>%
filter(sentiment == "negative")
wordcounts <- tidy_books %>%
group_by(book, chapter) %>%
summarize(words = n())
## `summarise()` has grouped output by 'book'. You can override using the
## `.groups` argument.
tidy_books %>%
semi_join(bingnegative) %>%
group_by(book, chapter) %>%
summarize(negativewords = n()) %>%
left_join(wordcounts, by = c("book", "chapter")) %>%
mutate(ratio = negativewords/words) %>%
filter(chapter != 0) %>%
slice_max(ratio, n = 1) %>%
ungroup()
## Joining, by = "word"
## `summarise()` has grouped output by 'book'. You can override using the
## `.groups` argument.
## # A tibble: 6 × 5
## book chapter negativewords words ratio
## <fct> <int> <int> <int> <dbl>
## 1 Sense & Sensibility 43 161 3405 0.0473
## 2 Pride & Prejudice 34 111 2104 0.0528
## 3 Mansfield Park 46 173 3685 0.0469
## 4 Emma 15 151 3340 0.0452
## 5 Northanger Abbey 21 149 2982 0.0500
## 6 Persuasion 4 62 1807 0.0343
Using a different corpus and sentiment lexicon
For my corpus, I chose the Harry Potter novels, available in the harrypotter library. I’ll also be using the Jockers lexicon from the lexicon package, which provides polarity scores as lookup values for 11,710 words.
library(harrypotter)
library(sentimentr)
library(lexicon)
##
## Attaching package: 'lexicon'
## The following object is masked from 'package:sentimentr':
##
## available_data
Getting the Jockers lexicon into a data frame and renaming its columns for readability
j_lexicon <- as.data.frame(lexicon::hash_sentiment_jockers)
j_lexicon <- j_lexicon %>%
rename("word" = "x", "score" = "y")
We’ll be using the text of the Harry Potter novels in our example. I pulled the corpus from this repo of .rda files containing the text of each Harry Potter book (7 in total). In order to make this code reproducible, I used the script extract_hp_data.R included in this repo to pull in the .rda data and dump it to csv files hosted on my GitHub (broken out by chapter).
Each csv in the linked repo corresponds to one chapter of a book from the Harry Potter series. For instance, chapter 5 of the Chamber of Secrets is contained in the chamber_of_secrets5.csv file.
First, let’s read in each csv file and dump them into a single dataframe, including book labels for plotting later.
# Chapter counts for each book. Note that c() suffixes each name with the
# sequence values, so names(chapter_lengths) are "philosophers_stone1",
# "philosophers_stone2", ..., which match the per-chapter csv file names.
chapter_lengths <- c("philosophers_stone" = seq(1, 17),
                     "chamber_of_secrets" = seq(1, 19),
                     "prisoner_of_azkaban" = seq(1, 22),
                     "goblet_of_fire" = seq(1, 37),
                     "order_of_the_phoenix" = seq(1, 22),
                     "half_blood_prince" = seq(1, 30),
                     "deathly_hallows" = seq(1, 37))
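Since that naming behavior of c() is easy to miss, here's a quick check (my addition) that the generated names match the csv file names:
# First few names: "philosophers_stone1", "philosophers_stone2", ...
head(names(chapter_lengths))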
# Looping over each chapter file in the GitHub repo,
# appending each csv's dataframe to a list (combined below)
datalist <- vector("list", length = length(chapter_lengths))
i <- 1
for (chapter in names(chapter_lengths)) {
  url <- glue("https://raw.githubusercontent.com/andrewbowen19/cunyDATA607/main/data/harrypotter/{chapter}.csv")
  # Strip the trailing chapter number to recover the book name
  book_name <- str_replace_all(chapter, "\\d{1,2}", "")
  chapter_df <- read.csv(url, sep = ",", header = TRUE) %>%
    mutate(book = book_name)
  datalist[[i]] <- chapter_df
  i <- i + 1
}
df <- do.call(rbind, datalist)
head(df)
## X word score n book
## 1 1 professor 0.40 30 philosophers_stone
## 2 2 yes 0.80 11 philosophers_stone
## 3 3 boy 0.25 9 philosophers_stone
## 4 4 like 0.50 9 philosophers_stone
## 5 5 well 0.80 9 philosophers_stone
## 6 6 good 0.75 7 philosophers_stone
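As a side note, the manual loop and index bookkeeping above could be replaced with purrr; here's a sketch under the same assumptions about the hosted csv layout:
# Alternative sketch using purrr::map_dfr instead of the manual loop
library(purrr)
df <- map_dfr(names(chapter_lengths), function(chapter) {
  url <- glue("https://raw.githubusercontent.com/andrewbowen19/cunyDATA607/main/data/harrypotter/{chapter}.csv")
  read.csv(url) %>%
    mutate(book = str_replace_all(chapter, "\\d{1,2}", ""))
})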
Since the hosted csv files already include the Jockers score and count for each word (see the score and n columns above), we don't need to tokenize the raw text and join the lexicon ourselves. Those steps are kept below, commented out, for reference: tokenizing the chamber_of_secrets text, then doing an inner join to match our lexicon scores up with the words in the text.
# text_cs <- tibble(chapter = seq_along(chamber_of_secrets),
#                   text = chamber_of_secrets) %>%
#   unnest_tokens(word, text) %>%
#   mutate(book = "Chamber of Secrets")
# cs_scores <- text_cs %>%
#   inner_join(j_lexicon) %>%
#   count(word, score, sort = TRUE) %>%
#   ungroup()
Plotting our results as in the Jane Austen example above. These sentiment scores per word are a bit busy.
df %>%
  mutate(index = row_number()) %>%
  ggplot(aes(index, score)) +
  geom_col(show.legend = FALSE)
Let’s re-create the above plot with the scores colored by book.
df %>%
  mutate(index = row_number()) %>%
  ggplot(aes(index, score, fill = book)) +
  geom_col()
Here is the same plot (sentiment scores per word, per book) broken out into facets, as in the Jane Austen example above.
# Each row keeps a unique index here, so this effectively carries each
# word's score through with its position for plotting
df_sentiment <- df %>%
  left_join(j_lexicon) %>%
  count(book, index = seq_len(nrow(df)), score)
## Joining, by = c("word", "score")
# Plotting the sentiment scores
ggplot(df_sentiment, aes(index, score, fill = book)) +
geom_col(show.legend = FALSE) +
facet_wrap(~book, ncol = 2, scales = "free_x")
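These per-word plots are fairly noisy. One option (a sketch of my own, mirroring the 80-line chunking from the Jane Austen example) is to aggregate the scores into fixed-size chunks of rows, weighting each word's score by its count n:
# Sketch: sum score * n over chunks of 80 rows for a smoother view
df %>%
  mutate(index = row_number() %/% 80) %>%
  group_by(book, index) %>%
  summarise(chunk_score = sum(score * n), .groups = "drop") %>%
  ggplot(aes(index, chunk_score, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x")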
I’d also like to look at the overall distribution of sentiment scores, as well as the mean score for each book.
ggplot(df, aes(x=score)) + geom_histogram(binwidth=0.1)
df %>%
  group_by(book) %>%
  summarise(mean_score = mean(score)) %>%
  ggplot(aes(x = book, y = mean_score)) +
  geom_col()
(J. Silge 2022)