Overview

Get the primary example code from Chapter 2 of Text Mining with R working and provide a citation for this base code.

Intro

First, let’s take a look at the sentiment analysis conducted in Chapter 2 of Text Mining with R:

Silge, Julia & Robinson, David. (2017). Text Mining with R: A Tidy Approach. O’Reilly Media. https://www.tidytextmining.com/sentiment.html

library(dplyr)
library(janeaustenr)
## Warning: package 'janeaustenr' was built under R version 4.0.4
library(stringr)
library(tidytext)

# Tag every line of each Austen novel with its line number and a running
# chapter counter, then split the text into one word per row.
tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(
    linenumber = row_number(),
    # A line that opens with "chapter" followed by a digit or roman-numeral
    # letter counts as a chapter heading; the cumulative sum of those hits
    # gives the chapter number (0 for front matter).
    chapter = cumsum(
      str_detect(text, regex("^chapter [\\divxlc]", ignore_case = TRUE))
    )
  ) %>%
  ungroup() %>%
  unnest_tokens(word, text)

# I've thus assigned a word per row in this dataset, and now I can attempt some sentiment analysis. 

# Keep only the NRC lexicon entries labelled with the "joy" sentiment.
nrc_joy <- filter(get_sentiments("nrc"), sentiment == "joy")

# Most frequent joy words in "Emma", most common first.
tidy_books %>%
  filter(book == "Emma") %>%
  inner_join(nrc_joy) %>%
  count(word, sort = TRUE)
## Joining, by = "word"
## # A tibble: 303 x 2
##    word        n
##    <chr>   <int>
##  1 good      359
##  2 young     192
##  3 friend    166
##  4 hope      143
##  5 happy     125
##  6 love      117
##  7 deal       92
##  8 found      92
##  9 present    89
## 10 kind       82
## # ... with 293 more rows
library(tidyr)

# Net Bing sentiment (positive minus negative word count) for every
# 80-line section of each book.
jane_austen_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  # One column per sentiment; sections lacking a sentiment get a count of 0.
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = 0
  ) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
library(ggplot2)

# One panel per novel: net sentiment of each 80-line section along the
# narrative, with an independent x axis per panel.
ggplot(
  data = jane_austen_sentiment,
  mapping = aes(x = index, y = sentiment, fill = book)
) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x")

# Restrict the tidy word table to a single novel for the lexicon comparison.
pride_prejudice <- filter(tidy_books, book == "Pride & Prejudice")

pride_prejudice
## # A tibble: 122,204 x 4
##    book              linenumber chapter word     
##    <fct>                  <int>   <int> <chr>    
##  1 Pride & Prejudice          1       0 pride    
##  2 Pride & Prejudice          1       0 and      
##  3 Pride & Prejudice          1       0 prejudice
##  4 Pride & Prejudice          3       0 by       
##  5 Pride & Prejudice          3       0 jane     
##  6 Pride & Prejudice          3       0 austen   
##  7 Pride & Prejudice          7       1 chapter  
##  8 Pride & Prejudice          7       1 1        
##  9 Pride & Prejudice         10       1 it       
## 10 Pride & Prejudice         10       1 is       
## # ... with 122,194 more rows
# AFINN scores each word numerically, so the section sentiment is the sum of
# the word values within every 80-line section.
afinn <- inner_join(pride_prejudice, get_sentiments("afinn")) %>%
  group_by(index = linenumber %/% 80) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(method = "AFINN")
## Joining, by = "word"
## `summarise()` ungrouping output (override with `.groups` argument)
# Bing and NRC label words as positive/negative, so the section sentiment is
# the positive count minus the negative count per 80-line section.
bing_and_nrc <- bind_rows(
  inner_join(pride_prejudice, get_sentiments("bing")) %>%
    mutate(method = "Bing et al."),
  # NRC carries other sentiment labels as well; keep only the two polarities.
  inner_join(
    pride_prejudice,
    filter(get_sentiments("nrc"), sentiment %in% c("positive", "negative"))
  ) %>%
    mutate(method = "NRC")
) %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
## Joining, by = "word"
# Stack the three lexicon results and draw one panel per method; each panel
# gets its own y scale.
bind_rows(afinn, bing_and_nrc) %>%
  ggplot(aes(x = index, y = sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")

Next, we can distill the dataset down to its most common positive and negative words.

# How many NRC lexicon entries are tagged positive vs. negative?
get_sentiments("nrc") %>%
  filter(sentiment %in% c("positive", "negative")) %>%
  count(sentiment)
## # A tibble: 2 x 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   3324
## 2 positive   2312
# Same tally for the Bing lexicon.
count(get_sentiments("bing"), sentiment)
## # A tibble: 2 x 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   4781
## 2 positive   2005
# Count how often each Bing-labelled word occurs in `tidy_books`, most
# frequent first.
bing_word_counts <- inner_join(tidy_books, get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
## Joining, by = "word"
bing_word_counts
## # A tibble: 2,585 x 3
##    word     sentiment     n
##    <chr>    <chr>     <int>
##  1 miss     negative   1855
##  2 well     positive   1523
##  3 good     positive   1380
##  4 great    positive    981
##  5 like     positive    725
##  6 better   positive    639
##  7 enough   positive    613
##  8 happy    positive    534
##  9 love     positive    495
## 10 pleasure positive    462
## # ... with 2,575 more rows

Lastly, let’s look at it graphically.

# Plot the ten most frequent words (ties included) for each Bing sentiment
# in `bing_word_counts`.
bing_word_counts %>%
  group_by(sentiment) %>%
  # slice_max() supersedes top_n() and names the ordering column explicitly,
  # so dplyr no longer has to guess it (the old "Selecting by n" message).
  slice_max(n, n = 10) %>%
  ungroup() %>%
  # Order the bars by count rather than alphabetically.
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(
    y = "Contribution to sentiment",
    x = NULL
  ) +
  coord_flip()

# Extend the stock stop-word list with "miss", tagged with a "custom" lexicon
# label so the added row can be told apart from the built-in entries.
custom_stop_words <- bind_rows(
  tibble(word = "miss", lexicon = "custom"),
  stop_words
)

#custom_stop_words
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 4.0.4
## Loading required package: RColorBrewer
## Warning: package 'RColorBrewer' was built under R version 4.0.3
# Word cloud of the 100 most frequent non-stop-words in `tidy_books`.
# NOTE(review): this filters with the stock `stop_words`, not the
# `custom_stop_words` table built earlier — confirm that is intended.
tidy_books %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(words = word, freq = n, max.words = 100))
## Joining, by = "word"

Next we can discern between positive or negative words in our wordcloud

library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
# Comparison cloud: reshape the word/sentiment counts into a word-by-sentiment
# matrix (0 where a word lacks a sentiment) and plot up to 100 words.
tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(formula = word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(
    colors = c("gray20", "gray80"),
    max.words = 100
  )
## Joining, by = "word"

Let’s look at the units beyond words

# Tokenize "Pride & Prejudice" into sentences instead of single words.
p_and_p_sentences <- unnest_tokens(
  tibble(text = prideprejudice),
  sentence,
  text,
  token = "sentences"
)

# Inspect the second sentence token.
p_and_p_sentences[["sentence"]][2]
## [1] "by jane austen"
#> [1] "by jane austen"

# Split every novel into chapter-sized tokens by cutting the text at chapter
# headings.
# NOTE(review): `|` has the lowest regex precedence, so the pattern reads as
# `Chapter` OR `CHAPTER [\dIVXLC]`; the trailing character class applies only
# to the upper-case alternative — confirm that is intended.
austen_chapters <- austen_books() %>%
  group_by(book) %>%
  unnest_tokens(
    chapter, text,
    token = "regex",
    pattern = "Chapter|CHAPTER [\\dIVXLC]"
  ) %>%
  ungroup()

# Number of chapter tokens per book.
austen_chapters %>%
  group_by(book) %>%
  summarise(chapters = n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 6 x 2
##   book                chapters
##   <fct>                  <int>
## 1 Sense & Sensibility       51
## 2 Pride & Prejudice         62
## 3 Mansfield Park            49
## 4 Emma                      56
## 5 Northanger Abbey          32
## 6 Persuasion                25
#> # A tibble: 6 x 2
#>   book                chapters
#>   <fct>                  <int>
#> 1 Sense & Sensibility       51
#> 2 Pride & Prejudice         62
#> 3 Mansfield Park            49
#> 4 Emma                      56
#> 5 Northanger Abbey          32
#> 6 Persuasion                25

New Corpus

This dataset comes from Kaggle and is a collection of customer sentiment pulled from Rotten Tomatoes. It includes language from the most popular review for each movie.

# Load the Rotten Tomatoes movie table straight from the GitHub-hosted CSV.
reviews <- read.csv(
  file = "https://raw.githubusercontent.com/evanmclaughlin/ECM607/master/RT_movie_reviews.csv"
)

head(reviews)
##                    rotten_tomatoes_link
## 1                             m/0814255
## 2                             m/0878835
## 3                                  m/10
## 4                m/1000013-12_angry_men
## 5 m/1000079-20000_leagues_under_the_sea
## 6                            m/10000_bc
##                                          movie_title
## 1 Percy Jackson & the Olympians: The Lightning Thief
## 2                                        Please Give
## 3                                                 10
## 4                    12 Angry Men (Twelve Angry Men)
## 5                       20,000 Leagues Under The Sea
## 6                                        10,000 B.C.
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  movie_info
## 1                                    Always trouble-prone, the life of teenager Percy Jackson (Logan Lerman) gets a lot more complicated when he learns he's the son of the Greek god Poseidon. At a training ground for the children of deities, Percy learns to harness his divine powers and prepare for the adventure of a lifetime: he must prevent a feud among the Olympians from erupting into a devastating war on Earth, and rescue his mother from the clutches of Hades, god of the underworld.
## 2    Kate (Catherine Keener) and her husband Alex (Oliver Platt) are wealthy New Yorkers who prowl estate sales and make a tidy profit reselling items they bought cheaply. They buy the apartment next door and plan to remodel just as soon as its current occupant, a cranky old woman, dies. Kate is troubled by the way she and her husband earn a living, and tries to assuage her guilt by befriending her tenant and the woman's granddaughters, but her overtures lead to unexpected consequences.
## 3                                                                                                                                                                                                                   A successful, middle-aged Hollywood songwriter falls hopelessly in love with the woman of his dreams, and even follows the girl and her new husband to their Mexican honeymoon resort. While his behavior seems sure to land him in trouble, out of the blue fate plays into his hands.
## 4                                        Following the closing arguments in a murder trial, the 12 members of the jury must deliberate, with a guilty verdict meaning death for the accused, an inner-city teen. As the dozen men try to reach a unanimous decision while sequestered in a room, one juror (Henry Fonda) casts considerable doubt on elements of the case. Personal issues soon rise to the surface, and conflict threatens to derail the delicate process that will decide one boy's fate.
## 5 In 1866, Professor Pierre M. Aronnax (Paul Lukas) and his assistant Conseil (Peter Lorre), stranded in San Francisco by reports of a giant sea monster attacking ships in the Pacific Ocean, are invited to join an expedition to search for the creature. During the search, they and harpooner Ned Land (Kirk Douglas) are thrown overboard during an attack, eventually discovering that the supposed monster is actually a submarine piloted by the brilliant but haunted Captain Nemo (James Mason).
## 6                                                                                                                                                                                                            Mammoth hunter D'Leh (Steven Strait) has long been in love with a beautiful, blue-eyed tribeswoman named Evolet (Camilla Belle). After horseback-riding raiders kidnap most of his D'Leh's fellow tribesmen as well as Evolet, he sets out on a dangerous trek to rescue her from her captors.
##                                                                                                                                                                   critics_consensus
## 1          Though it may seem like just another Harry Potter knockoff, Percy Jackson benefits from a strong supporting cast, a speedy plot, and plenty of fun with Greek mythology.
## 2 Nicole Holofcener's newest might seem slight in places, but its rendering of complex characters in a conflicted economic landscape is varied, natural, and touching all the same.
## 3                             Blake Edwards' bawdy comedy may not score a perfect 10, but Dudley Moore's self-deprecating performance makes this midlife crisis persistently funny.
## 4                                         Sidney Lumet's feature debut is a superbly written, dramatically effective courtroom thriller that rightfully stands as a modern classic.
## 5          One of Disney's finest live-action adventures, 20,000 Leagues Under the Sea brings Jules Verne's classic sci-fi tale to vivid life, and features an awesome giant squid.
## 6                             With attention strictly paid to style instead of substance, or historical accuracy, 10,000 B.C. is a visually impressive but narratively flimsy epic.
##   content_rating                                                       genres
## 1             PG Action & Adventure, Comedy, Drama, Science Fiction & Fantasy
## 2              R                                                       Comedy
## 3              R                                              Comedy, Romance
## 4             NR                                              Classics, Drama
## 5              G                     Action & Adventure, Drama, Kids & Family
## 6          PG-13                          Action & Adventure, Classics, Drama
##           directors                                    authors
## 1    Chris Columbus Craig Titley, Chris Columbus, Rick Riordan
## 2 Nicole Holofcener                          Nicole Holofcener
## 3     Blake Edwards                              Blake Edwards
## 4      Sidney Lumet                              Reginald Rose
## 5 Richard Fleischer                                Earl Felton
## 6   Roland Emmerich             Harald Kloser, Roland Emmerich
##   actors
## 1 Logan Lerman, Brandon T. Jackson, Alexandra Daddario, Jake Abel, Sean Bean, Pierce Brosnan, Steve Coogan, Rosario Dawson, Melina Kanakaredes, Catherine Keener, Kevin Mckidd, Joe Pantoliano, Uma Thurman, Ray Winstone, Julian Richings, Bonita Friedericy, Annie Ilonzeh, Tania Saulnier, Marie Avgeropoulos, Luisa D'Oliveira, Christie Laing, Marielle Jaffe, Elisa King, Chrystal Tisiga, Alexis Knapp, Charlie Gallant, Chelan Simmons, Andrea Brooks, Natassia Malthe, Max Van Ville, Serinda Swan, Dimitri Lekkos, Ona Grauer, Stefanie von Pfetten, Conrad Coates, Erica Cerra, Dylan Neal, Luke Camilleri, Holly Hougham, Ina Geraldine, Raquel Riskin, Yusleidis Oquendo, Janine Edwards, Valerie Tian, Violet Columbus, Sarah Smyth, Merritt Patterson, Julie Luck, Andrea Day, John Stewart, Suzanne Ristic, Deejay Jackson, Matthew Garrick, Stan Carp, Suzanna Ristic, Richard Harmon, Maria Olsen, Robin Lemon, Doyle Devereux, Tom Pickett, VJ Delos-Reyes, Tim Aas, Keith Dallas, Spencer Atkinson, Maya Washington, Loyd Bateman, Victor Ayala, Zane Holtz, Eli Zagoudakis, Matt Reimer, Rob Hayter, Lloyd Bateman, Shawn Beaton, Jarod Joseph, Reilly Dolman, Paul Cummings, Julie Brar, Dejan Loyola, Damian Arman, Mario Casoria, Dorla Bell, Carolyn Adair (II), Jade Pawluk, G. Patrick Currie, Darian Arman, Mariela Zapata, David L. Smith
## 2 Catherine Keener, Amanda Peet, Oliver Platt, Rebecca Hall, Sarah Steele, Ann Morgan Guilbert, Kevin Corrigan, Lois Smith
## 3 Dudley Moore, Bo Derek, Julie Andrews, Robert Webber, Dee Wallace, Sam Jones III, Sam J. Jones, Brian Dennehy, Max Showalter, Rad Daly, Nedra Volz, James Noble, Virginia Kiser, John Hawker, Deborah Rush, Don Calfa, Walter George Alton, Annette Martin, John Hancock, Lorry Goldman, Arthur Rosenberg, Mari Gorman, Marcy Hanson, Julia Jennings, Senilo Tanney, Kitty DeCarlo, William Lucking, Owen Sullivan, Debbie White, Laurence Carr, Camila Ashland, Adrian Aron, Gail Bowman, Burke Byrnes, Michael Champion, Doug Sheehan, J. Victor Lopez, Gregory Chase, Jon Linton, Ellen Clark, Antonia Ellis, John Chappell, Lynn Farrell, Art Kassul, Denise Crosby, Jeanetta Arnette
## 4 Martin Balsam, John Fiedler, Lee J. Cobb, E.G. Marshall, Jack Klugman, Edward Binns, Jack Warden, Henry Fonda, Joseph Sweeney, Ed Begley Sr., George Voskovec, Robert Webber, Rudy Bond, James A. Kelly, Billy Nelson, John Savoca
## 5 James Mason, Kirk Douglas, Paul Lukas, Peter Lorre, Robert J. Wilke, Carleton Young, Bob Wilke, Ted de Corsia, Percy Helton, Ted Cooper, Eddie Marr, Fred Graham, Esmeralda, J.M. Kerrigan, Harry Harvey, Herb Vigran
## 6                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       Steven Strait, Camilla Belle, Cliff Curtis, Joel Virgel, Affif Ben Badra, Mo Zainal, Nathanael Baring, Marco Khan, Reece Ritchie, Joel Fry, Junior Oliphant, Louise Tu'u, David Dennis, Kristian Beazley, Mona Hammond, Jacob Renton, Grayson Hunt Urwin, Fahruq Valley-Omar, Boubacar Badaine, Joe Vaz, Charles Baloyi, Tim Barlow, Gabriel Malema, Mark Simmons, Hannah Westbury, Affif Ben Badra, Mo Zinal, Omar Sharif, Farouk Valley-Omar
##   original_release_date streaming_release_date runtime     production_company
## 1             2/12/2010             11/25/2015     119       20th Century Fox
## 2             4/30/2010               9/4/2012      90 Sony Pictures Classics
## 3             10/5/1979              7/24/2014     122            Waner Bros.
## 4             4/13/1957              1/13/2017      95   Criterion Collection
## 5              1/1/1954              6/10/2016     127                 Disney
## 6              3/7/2008              6/22/2013     109  Warner Bros. Pictures
##   tomatometer_status tomatometer_rating tomatometer_count audience_status
## 1             Rotten                 49               149         Spilled
## 2    Certified-Fresh                 87               142         Upright
## 3              Fresh                 67                24         Spilled
## 4    Certified-Fresh                100                54         Upright
## 5              Fresh                 89                27         Upright
## 6             Rotten                  8               149         Spilled
##   audience_rating audience_count tomatometer_top_critics_count
## 1              53         254421                            43
## 2              64          11574                            44
## 3              53          14684                             2
## 4              97         105386                             6
## 5              74          68918                             5
## 6              37         411140                            37
##   tomatometer_fresh_critics_count tomatometer_rotten_critics_count
## 1                              73                               76
## 2                             123                               19
## 3                              16                                8
## 4                              54                                0
## 5                              24                                3
## 6                              12                              137
# Keep only the two columns needed for the sentiment analysis: the movie
# title and the critics' consensus text.
dataset <- reviews[, c("movie_title", "critics_consensus")]

head(dataset)
##                                          movie_title
## 1 Percy Jackson & the Olympians: The Lightning Thief
## 2                                        Please Give
## 3                                                 10
## 4                    12 Angry Men (Twelve Angry Men)
## 5                       20,000 Leagues Under The Sea
## 6                                        10,000 B.C.
##                                                                                                                                                                   critics_consensus
## 1          Though it may seem like just another Harry Potter knockoff, Percy Jackson benefits from a strong supporting cast, a speedy plot, and plenty of fun with Greek mythology.
## 2 Nicole Holofcener's newest might seem slight in places, but its rendering of complex characters in a conflicted economic landscape is varied, natural, and touching all the same.
## 3                             Blake Edwards' bawdy comedy may not score a perfect 10, but Dudley Moore's self-deprecating performance makes this midlife crisis persistently funny.
## 4                                         Sidney Lumet's feature debut is a superbly written, dramatically effective courtroom thriller that rightfully stands as a modern classic.
## 5          One of Disney's finest live-action adventures, 20,000 Leagues Under the Sea brings Jules Verne's classic sci-fi tale to vivid life, and features an awesome giant squid.
## 6                             With attention strictly paid to style instead of substance, or historical accuracy, 10,000 B.C. is a visually impressive but narratively flimsy epic.

First, I’ll tokenize the critics’ consensus.

# Split each critics' consensus into one word per row and drop the stock
# stop words.
token <- dataset %>%
  # Refer to the text column by name rather than by position (2), so the
  # step keeps working if the column order of `dataset` ever changes.
  unnest_tokens(word, critics_consensus) %>%
  anti_join(stop_words)
## Joining, by = "word"
# Word frequencies across all consensus texts, most common first.
token_count <- count(token, word, sort = TRUE)

head(token_count)
##           word   n
## 1        story 827
## 2         cast 709
## 3  performance 677
## 4 performances 676
## 5     director 641
## 6        movie 542

There are some words at the top here that don’t add much to our analysis, so I’ll customize my stop word list to improve the analysis.

# Frequent movie-review words chosen for removal, labelled with a "custom"
# lexicon tag.
new_stop <- data.frame(
  word = c(
    "story", "cast", "performance", "performances", "director", "movie",
    "film", "life", "offers", "makes", "characters", "fans", "viewers",
    "genre", "direction", "writer", "script", "tale", "ultimately",
    "material", "effects", "character", "time", "subject", "features",
    "title", "plot", "comedy", "drama", "thriller", "action", "horror",
    "acted", "premise"
  ),
  lexicon = "custom"
)
# Custom entries first, followed by the stock stop-word list.
my_stopwords <- rbind(new_stop, stop_words)

# Drop every token that appears in the combined stop-word list.
critic <- filter(token, !word %in% my_stopwords$word)

critic_count <- count(critic, word, sort = TRUE)

head(critic_count)
##           word   n
## 1        funny 348
## 2       strong 313
## 3 entertaining 293
## 4        humor 286
## 5      classic 265
## 6     talented 242

That’s a pretty good collection of sentiment. Let’s introduce a lexicon to bucket the sentiments as either positive or negative.

# Preview the Bing lexicon: one row per word with its sentiment label.
get_sentiments(lexicon = "bing")
## # A tibble: 6,786 x 2
##    word        sentiment
##    <chr>       <chr>    
##  1 2-faces     negative 
##  2 abnormal    negative 
##  3 abolish     negative 
##  4 abominable  negative 
##  5 abominably  negative 
##  6 abominate   negative 
##  7 abomination negative 
##  8 abort       negative 
##  9 aborted     negative 
## 10 aborts      negative 
## # ... with 6,776 more rows
# Match the cleaned review tokens against the Bing lexicon and compute how the
# matched words split between negative and positive.
critic_bing_pct <- inner_join(critic, get_sentiments("bing")) %>%
  filter(sentiment %in% c("positive", "negative")) %>%
  count(sentiment) %>%
  # Despite its name, `total` holds each sentiment's share of all matches.
  mutate(total = n / sum(n))
## Joining, by = "word"
critic_bing_pct
##   sentiment     n     total
## 1  negative 11547 0.4280154
## 2  positive 15431 0.5719846

This is especially surprising. The critics’ job is criticism after all. Let’s take a look at this graphically.

# Per-word Bing sentiment counts for the review tokens. This reassigns
# `bing_word_counts`, replacing the value it held before.
bing_word_counts <- inner_join(critic, get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
## Joining, by = "word"
# Ten most frequent words (ties included) per sentiment, one panel each.
bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(order_by = n, n = 10) %>%
  ungroup() %>%
  # Order the bars by count rather than alphabetically.
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = n, y = word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(
    x = "Contribution to sentiment",
    y = NULL
  )

Pretty strange that bing buckets “funny” as a negative, and it’s especially a shame that it contributes disproportionately to the negative sentiment, but our analysis at least highlights a problem with using bing for movie review purposes!