Load Libraries

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.8     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.1
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(tidytext)
library(textdata)
library(janeaustenr)
library(dplyr)
library(stringr)

Mirroring Text Mining with R

Source: “Text Mining with R: A Tidy Approach” by Julia Silge and David Robinson

Get Sentiment of Jane Austen books

as shown in Text Mining with R

# Preview the AFINN lexicon: one row per word with a numeric sentiment value.
get_sentiments(lexicon = "afinn")
## # A tibble: 2,477 × 2
##    word       value
##    <chr>      <dbl>
##  1 abandon       -2
##  2 abandoned     -2
##  3 abandons      -2
##  4 abducted      -2
##  5 abduction     -2
##  6 abductions    -2
##  7 abhor         -3
##  8 abhorred      -3
##  9 abhorrent     -3
## 10 abhors        -3
## # … with 2,467 more rows
# Turn the Austen novels into a one-word-per-row table, remembering for every
# word which line and which chapter of its book it came from.
tidy_books <- austen_books() |>
  group_by(book) |>
  mutate(
    # Position of the line within its own book.
    linenumber = seq_len(n()),
    # Every line that starts with a chapter heading bumps the running count.
    chapter = cumsum(
      str_detect(text, regex("^chapter [\\divxlc]", ignore_case = TRUE))
    )
  ) |>
  ungroup() |>
  unnest_tokens(output = word, input = text)

Review the sentiment of specific books and compare different sentiment models

as shown in Text Mining with R

# Words the NRC lexicon tags with the "joy" sentiment.
nrc_joy <- filter(get_sentiments("nrc"), sentiment == "joy")

# Joy words that occur in "Emma", most frequent first.
tidy_books |>
  filter(book == "Emma") |>
  inner_join(nrc_joy) |>
  count(word, sort = TRUE)
## Joining, by = "word"
## # A tibble: 301 × 2
##    word          n
##    <chr>     <int>
##  1 good        359
##  2 friend      166
##  3 hope        143
##  4 happy       125
##  5 love        117
##  6 deal         92
##  7 found        92
##  8 present      89
##  9 kind         82
## 10 happiness    76
## # … with 291 more rows
# Net Bing sentiment (positive word count minus negative word count) for each
# 80-line section of every book.
jane_austen_sentiment <- tidy_books |>
  inner_join(get_sentiments("bing")) |>
  mutate(index = linenumber %/% 80) |>
  count(book, index, sentiment) |>
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = 0
  ) |>
  mutate(sentiment = positive - negative)
## Joining, by = "word"
# One panel per novel: net sentiment of each 80-line section along the book.
jane_austen_sentiment |>
  ggplot(mapping = aes(x = index, y = sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(book), ncol = 2, scales = "free_x")

Comparison of sentiment models across same corpus

as shown in Text Mining with R

# Keep only the tokens of "Pride & Prejudice" for the lexicon comparison.
pride_prejudice <- filter(tidy_books, book == "Pride & Prejudice")

# Show the filtered token table.
pride_prejudice
## # A tibble: 122,204 × 4
##    book              linenumber chapter word     
##    <fct>                  <int>   <int> <chr>    
##  1 Pride & Prejudice          1       0 pride    
##  2 Pride & Prejudice          1       0 and      
##  3 Pride & Prejudice          1       0 prejudice
##  4 Pride & Prejudice          3       0 by       
##  5 Pride & Prejudice          3       0 jane     
##  6 Pride & Prejudice          3       0 austen   
##  7 Pride & Prejudice          7       1 chapter  
##  8 Pride & Prejudice          7       1 1        
##  9 Pride & Prejudice         10       1 it       
## 10 Pride & Prejudice         10       1 is       
## # … with 122,194 more rows
# AFINN: sum the word values within each 80-line section.
afinn <- pride_prejudice |>
  inner_join(get_sentiments("afinn")) |>
  mutate(index = linenumber %/% 80) |>
  group_by(index) |>
  summarise(sentiment = sum(value), method = "AFINN")
## Joining, by = "word"
# Bing and NRC only label words as positive/negative, so the score of each
# 80-line section is the positive word count minus the negative word count.
bing_and_nrc <- bind_rows(
  pride_prejudice |>
    inner_join(get_sentiments("bing")) |>
    mutate(method = "Bing et al."),
  pride_prejudice |>
    inner_join(
      # NRC carries further categories; keep only the two polarity labels.
      filter(get_sentiments("nrc"), sentiment %in% c("positive", "negative"))
    ) |>
    mutate(method = "NRC")
) |>
  mutate(index = linenumber %/% 80) |>
  count(method, index, sentiment) |>
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = 0
  ) |>
  mutate(sentiment = positive - negative)
## Joining, by = "word"
## Joining, by = "word"
# Stack the three lexicon results and draw one panel per method.
bind_rows(afinn, bing_and_nrc) |>
  ggplot(mapping = aes(x = index, y = sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(method), ncol = 1, scales = "free_y")

Load Stanford movie reviews dataset

Potts, Christopher. 2011. On the negativity of negation. In Nan Li and David Lutz, eds., Proceedings of Semantics and Linguistic Theory 20, 636-659.

#' Load a single review file into a one-row data frame.
#'
#' The sentiment label is taken from the name of the file's immediate parent
#' directory (`pos` means positive, anything else negative). Matching "pos"
#' anywhere in the path would mislabel negative reviews stored under a
#' directory such as `repos/`.
#'
#' @param filepath Path to one review text file.
#' @param id Identifier to store in the `id` column.
#' @return A data frame with the columns `id`, `filename`, `text` and
#'   `load_sentiment`.
load_reviews <- function(filepath, id) {
  file_str <- readr::read_file(filepath)

  sentiment <- if (basename(dirname(filepath)) == "pos") {
    "positive"
  } else {
    "negative"
  }

  data.frame(
    id = id,
    filename = basename(filepath),
    text = file_str,
    load_sentiment = sentiment
  )
}
# Download the IMDB training split into ./Data.
# Comment out the line below if the data is already loaded.
mov_reviews <- textdata::dataset_imdb(dir = "./Data", split = "train")

# Take at most the first 10 reviews of each class. `pattern` is a regular
# expression (not a glob), so the ".txt" extension is matched explicitly, and
# head() avoids the NA entries that `[1:10]` yields when a folder holds fewer
# than 10 files.
pos_files <- head(
  list.files(
    "./Data/imdb/aclImdb/train/pos",
    pattern = "\\.txt$",
    full.names = TRUE
  ),
  10
)
neg_files <- head(
  list.files(
    "./Data/imdb/aclImdb/train/neg",
    pattern = "\\.txt$",
    full.names = TRUE
  ),
  10
)

# Read every selected file; the running position in the combined list is
# used as the review id.
review_files <- c(pos_files, neg_files)
df_reviews <- bind_rows(
  map2(review_files, seq_along(review_files), load_reviews)
)

Compare pre-labeled dataset against two tidytext sentiment models

# Break every review into one row per word, numbering the rows of each file
# before tokenising.
df_word_detail <- df_reviews |>
  group_by(filename) |>
  mutate(linenumber = seq_len(n())) |>
  ungroup() |>
  unnest_tokens(output = word, input = text)


# Score every review with the Bing and NRC lexicons (positive word count minus
# negative word count per file), then attach the pre-assigned label.
sentiment_reviews <- bind_rows(
  df_word_detail |>
    inner_join(get_sentiments("bing")) |>
    mutate(method = "Bing et al."),
  df_word_detail |>
    inner_join(
      # NRC carries further categories; keep only the two polarity labels.
      filter(get_sentiments("nrc"), sentiment %in% c("positive", "negative"))
    ) |>
    mutate(method = "NRC")
) |>
  mutate(index = filename) |>
  count(method, index, sentiment) |>
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = 0
  ) |>
  mutate(sentiment = positive - negative) |>
  inner_join(df_reviews, by = c("index" = "filename")) |>
  select(-id, -text)
## Joining, by = "word"
## Joining, by = "word"
# Net sentiment per review file, one panel per lexicon; each bar is annotated
# with the review's pre-assigned label.
ggplot(
  sentiment_reviews,
  mapping = aes(x = index, y = sentiment, fill = method)
) +
  geom_col(show.legend = FALSE) +
  geom_text(
    mapping = aes(label = load_sentiment),
    color = "black",
    size = 2,
    position = position_dodge(width = 0.9)
  ) +
  theme(axis.text.x = element_text(angle = 90, hjust = 0.9)) +
  facet_wrap(vars(method), ncol = 1, scales = "free_y")

# Per lexicon, count how many reviews have a net sentiment whose sign agrees
# with the pre-assigned label ("Y") or not ("N"), and the share of each.
# A net sentiment of exactly 0 counts as a miss.
sentiment_reviews |>
  mutate(
    sentiment_match = case_when(
      sentiment < 0 & load_sentiment == "negative" ~ "Y",
      sentiment > 0 & load_sentiment == "positive" ~ "Y",
      TRUE ~ "N"
    )
  ) |>
  count(sentiment_match, method, name = "cnt") |>
  group_by(method) |>
  # The denominator is the number of reviews actually scored by this method,
  # instead of assuming exactly two methods that each cover every review.
  mutate(match_perc = cnt / sum(cnt)) |>
  ungroup() |>
  arrange(method)
## `summarise()` has grouped output by 'sentiment_match', 'method'. You can
## override using the `.groups` argument.
## # A tibble: 4 × 4
## # Groups:   sentiment_match, method [4]
##   sentiment_match method        cnt match_perc
##   <chr>           <chr>       <int>      <dbl>
## 1 N               Bing et al.     4        0.2
## 2 Y               Bing et al.    16        0.8
## 3 N               NRC             8        0.4
## 4 Y               NRC            12        0.6

The graph displays the net sentiment of each review, computed from its individual words, alongside the pre-assigned label. On its own, however, the graph does not make it obvious which sentiment model came closer to the expected results. The separate summary above therefore counts the correct sentiment classifications, and it shows that the Bing model far outperformed the NRC model on this specific movie review sample (80% versus 60% of the reviews classified correctly).

Why is there a discrepancy of this magnitude?

There could be several reasons for the different results. Perhaps the NRC lexicon does not cover as many of the words contained in these reviews, so it misses impactful words that would otherwise drive the sentiment in the correct direction. Alternatively, the reviews may contain unexpected word usage or sarcasm, which neither model can take into account because each word is scored individually.